Groupby

[1]:
import xarray
import climtas
import dask.array
import pandas
import numpy

Say we have daily input data for several years that we want to convert to a daily mean climatology.

[2]:
time = pandas.date_range('20010101', '20040101', freq='D', closed='left')

data = dask.array.random.random((len(time),50,100), chunks=(90,25,25))
lat = numpy.linspace(-90, 90, data.shape[1])
lon = numpy.linspace(-180, 180, data.shape[2], endpoint=False)

da = xarray.DataArray(data, coords=[('time', time), ('lat', lat), ('lon', lon)], name='temperature')
da
[2]:
<xarray.DataArray 'temperature' (time: 1095, lat: 50, lon: 100)>
dask.array<random_sample, shape=(1095, 50, 100), dtype=float64, chunksize=(90, 25, 25), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 2001-01-01 2001-01-02 ... 2003-12-31
  * lat      (lat) float64 -90.0 -86.33 -82.65 -78.98 ... 78.98 82.65 86.33 90.0
  * lon      (lon) float64 -180.0 -176.4 -172.8 -169.2 ... 169.2 172.8 176.4

The Xarray way is to use xarray.DataArray.groupby; however, that is an expensive operation. We started with 104 tasks and 104 chunks in the Dask graph, and this has exploded to 23,464 tasks and 2,920 chunks. For a large dataset this increase in chunk count really bogs down Dask.

The reason for this is that with groupby Xarray creates a new output chunk for each individual day of year; you can see the chunk size of the output is now (1, 25, 25).

[3]:
da.groupby('time.dayofyear').mean()
[3]:
<xarray.DataArray 'temperature' (dayofyear: 365, lat: 50, lon: 100)>
dask.array<stack, shape=(365, 50, 100), dtype=float64, chunksize=(1, 25, 25), chunktype=numpy.ndarray>
Coordinates:
  * lat        (lat) float64 -90.0 -86.33 -82.65 -78.98 ... 82.65 86.33 90.0
  * lon        (lon) float64 -180.0 -176.4 -172.8 -169.2 ... 169.2 172.8 176.4
  * dayofyear  (dayofyear) int64 1 2 3 4 5 6 7 8 ... 359 360 361 362 363 364 365
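
The task and chunk counts quoted above can be checked directly on the underlying Dask arrays. A minimal sketch (exact task counts can vary a little between Dask and Xarray versions):

[ ]:
daily_mean = da.groupby('time.dayofyear').mean()

# Number of tasks in the Dask graph and total number of chunks,
# for the input array and for the groupby result
print(len(da.data.dask), da.data.npartitions)                  # roughly 104 tasks, 104 chunks
print(len(daily_mean.data.dask), daily_mean.data.npartitions)  # roughly 23,464 tasks, 2,920 chunks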

climtas.blocked.blocked_groupby limits the number of chunks created as much as possible. It does this by reshaping the array to stack individual years, then reducing over the new stacked axis, rather than using Pandas indexing operations (a rough sketch of this idea follows the example below). It does, however, require the input data to be evenly spaced in time, which well-behaved datasets should be.

[4]:
climtas.blocked_groupby(da, time='dayofyear').mean()
[4]:
<xarray.DataArray 'stack-f4e41af3171d33521253e01e4a44f4a5' (dayofyear: 366, lat: 50, lon: 100)>
dask.array<mean_agg-aggregate, shape=(366, 50, 100), dtype=float64, chunksize=(80, 25, 25), chunktype=numpy.ndarray>
Coordinates:
  * lat        (lat) float64 -90.0 -86.33 -82.65 -78.98 ... 82.65 86.33 90.0
  * lon        (lon) float64 -180.0 -176.4 -172.8 -169.2 ... 169.2 172.8 176.4
  * dayofyear  (dayofyear) int64 1 2 3 4 5 6 7 8 ... 360 361 362 363 364 365 366
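
The reshaping idea can be sketched directly with Dask. This is a rough illustration only, with the three 365-day years hard-coded for clarity (blocked_groupby itself pads each year out to 366 values, which is why the dayofyear axis above has length 366):

[ ]:
# Stack the three years along a new leading axis: (year, dayofyear, lat, lon)
stacked = da.data.reshape(3, 365, 50, 100)

# Reducing over the 'year' axis gives one value per day of year while keeping
# large chunks along the dayofyear axis, instead of one chunk per day
climatology = stacked.mean(axis=0)
climatology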