Resample

[1]:
import xarray
import climtas
import dask.array
import pandas
import numpy

Say we have hourly input data for a year that we want to convert to daily data.

[2]:
# 'closed' was renamed to 'inclusive' in recent pandas versions
time = pandas.date_range('20010101', '20020101', freq='H', inclusive='left')

data = dask.array.random.random((len(time),50,100), chunks=(24*60,25,25))
lat = numpy.linspace(-90, 90, data.shape[1])
lon = numpy.linspace(-180, 180, data.shape[2], endpoint=False)

da = xarray.DataArray(data, coords=[('time', time), ('lat', lat), ('lon', lon)], name='temperature')
da
[2]:
<xarray.DataArray 'temperature' (time: 8760, lat: 50, lon: 100)>
dask.array<random_sample, shape=(8760, 50, 100), dtype=float64, chunksize=(1440, 25, 25), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 2001-01-01 ... 2001-12-31T23:00:00
  * lat      (lat) float64 -90.0 -86.33 -82.65 -78.98 ... 78.98 82.65 86.33 90.0
  * lon      (lon) float64 -180.0 -176.4 -172.8 -169.2 ... 169.2 172.8 176.4
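
It's worth checking the chunk layout Dask has chosen before going further. A quick sketch - the counts follow from the chunk sizes shown above (7 time chunks x 2 lat chunks x 4 lon chunks):

[ ]:
# 7 x 2 x 4 = 56 chunks in total, matching the 56 tasks in the starting graph
print(da.data.chunks)
print(da.data.npartitions)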

The idiomatic Xarray way is to use xarray.DataArray.resample; however, that is an expensive function to run - we started with 56 tasks and 56 chunks in the Dask graph, and this has exploded to 11,736 tasks and 2,920 chunks. For a large dataset this increase in chunk count really bogs down Dask.

The reason for this is that with resample Xarray creates a new output chunk for each individual day - you can see the chunk size of the output is now (1, 25, 25).

[3]:
da.resample(time='D').mean()
[3]:
<xarray.DataArray 'temperature' (time: 365, lat: 50, lon: 100)>
dask.array<stack, shape=(365, 50, 100), dtype=float64, chunksize=(1, 25, 25), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 2001-01-01 2001-01-02 ... 2001-12-31
  * lat      (lat) float64 -90.0 -86.33 -82.65 -78.98 ... 78.98 82.65 86.33 90.0
  * lon      (lon) float64 -180.0 -176.4 -172.8 -169.2 ... 169.2 172.8 176.4
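
You can count the tasks and chunks yourself - a sketch, noting that exact task counts vary between xarray and dask versions:

[ ]:
resampled = da.resample(time='D').mean()

# Total tasks in the Dask graph and total chunks, before and after
print(len(da.data.dask), da.data.npartitions)
print(len(resampled.data.dask), resampled.data.npartitions)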

A better way is to use xarray.DataArray.coarsen to do the resampling. This keeps the original number of chunks, but has the drawback that it's not aware of the time axis: you need to specify that it should be reduced by 24 samples. It also won't complain if the time axis is uneven; however, for most well-behaved datasets this shouldn't be an issue.

[4]:
da.coarsen(time=24).mean()
[4]:
<xarray.DataArray 'temperature' (time: 365, lat: 50, lon: 100)>
dask.array<mean_agg-aggregate, shape=(365, 50, 100), dtype=float64, chunksize=(60, 25, 25), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 2001-01-01T11:30:00 ... 2001-12-31T11:30:00
  * lat      (lat) float64 -90.0 -86.33 -82.65 -78.98 ... 78.98 82.65 86.33 90.0
  * lon      (lon) float64 -180.0 -176.4 -172.8 -169.2 ... 169.2 172.8 176.4
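
Note that coarsen also reduces the time coordinate by averaging it, which is why the daily timestamps above land at 11:30 rather than midnight. If you'd rather not hard-code the factor of 24, you can derive it from the spacing of the time coordinate - a minimal sketch assuming a regular hourly axis:

[ ]:
# Derive the reduction factor from the time step
# (assumes an evenly spaced axis - coarsen itself won't verify this)
step = da.time.values[1] - da.time.values[0]    # numpy.timedelta64, 1 hour here
factor = int(numpy.timedelta64(1, 'D') / step)  # 24 samples per day
da.coarsen(time=factor).mean()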

climtas.blocked.blocked_resample works the same way as coarsen, giving you the same number of chunks as you started with, but it is also time-axis aware: it checks that the time axis is evenly spaced, and you can use Pandas time interval names instead of a sample count.

[5]:
climtas.blocked_resample(da, time='D').mean()
[5]:
<xarray.DataArray 'temperature' (time: 365, lat: 50, lon: 100)>
dask.array<resample_op, shape=(365, 50, 100), dtype=float64, chunksize=(60, 25, 25), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 2001-01-01 2001-01-02 ... 2001-12-31
  * lat      (lat) float64 -90.0 -86.33 -82.65 -78.98 ... 78.98 82.65 86.33 90.0
  * lon      (lon) float64 -180.0 -176.4 -172.8 -169.2 ... 169.2 172.8 176.4
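
Since blocked_resample accepts either a Pandas interval name or a sample count, the two forms below should be equivalent for this hourly dataset - a sketch based on the description above, so check the climtas documentation for the exact API:

[ ]:
# Assumed equivalent for hourly data: one day is 24 samples
daily = climtas.blocked_resample(da, time='D').mean()
also_daily = climtas.blocked_resample(da, time=24).mean()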