Classification of raster timeseries with xr_fresh, geowombat and sklearn

import geowombat as gw
import os
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import tempfile
from pathlib import Path
from xr_fresh.feature_calculator_series import (
    minimum,
    abs_energy,
    mean_abs_change,
    ratio_beyond_r_sigma,
    symmetry_looking,
    sum,
    quantile,
    function_mapping,
)

# make temp directory for outputs
temp_dir = Path(tempfile.mkdtemp())
# Set up error logging
import logging

# set up error logging
logging.basicConfig(
    filename=os.path.join(temp_dir, "error_log.log"),
    level=logging.ERROR,
    format="%(asctime)s:%(levelname)s:%(message)s",
)
# Read in example data
os.chdir("../../xr_fresh/data/")

band_name = "ppt"  # used to rename outputs
file_glob = f"pdsi*tif"
strp_glob = f"pdsi_%Y%m_4500m.tif"

dates = sorted(
    datetime.strptime(string, strp_glob) for string in sorted(glob(file_glob))
)
files = sorted(glob(file_glob))

# print dates and files in a table
pd.DataFrame({"date": dates, "file": files})
/home/mmann1123/miniconda3/envs/xr_fresh_update/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
date file
0 2018-01-01 pdsi_201801_4500m.tif
1 2018-02-01 pdsi_201802_4500m.tif
2 2018-03-01 pdsi_201803_4500m.tif
3 2018-04-01 pdsi_201804_4500m.tif
4 2018-05-01 pdsi_201805_4500m.tif
5 2018-06-01 pdsi_201806_4500m.tif
6 2018-07-01 pdsi_201807_4500m.tif
7 2018-08-01 pdsi_201808_4500m.tif
8 2018-09-01 pdsi_201809_4500m.tif
9 2018-10-01 pdsi_201810_4500m.tif
10 2018-11-01 pdsi_201811_4500m.tif
11 2018-12-01 pdsi_201812_4500m.tif
from xr_fresh.extractors_series import extract_features_series

# Define the feature dictionary
feature_dict = {
    "minimum": [{}],
    "abs_energy": [{}],
    "doy_of_maximum": [{"dates": dates}],
    "mean_abs_change": [{}],
    "ratio_beyond_r_sigma": [{"r": 1}, {"r": 2}],
    "symmetry_looking": [{}],
    "sum": [{}],
    "quantile": [{"q": 0.05}, {"q": 0.95}],
}

# Define the band name and output directory
band_name = "ppt"

# Extract features from the geospatial time series
extract_features_series(
    files, feature_dict, band_name, temp_dir, num_workers=12, nodata=-9999
)
100%|██████████| 4/4 [00:00<00:00, 86.77it/s]
100%|██████████| 4/4 [00:00<00:00, 59.55it/s]
100%|██████████| 4/4 [00:00<00:00, 18.64it/s]
100%|██████████| 4/4 [00:00<00:00, 32.75it/s]
100%|██████████| 4/4 [00:00<00:00, 11.07it/s]
100%|██████████| 4/4 [00:00<00:00, 3359.47it/s]
100%|██████████| 4/4 [00:00<00:00, 13.08it/s]
100%|██████████| 4/4 [00:00<00:00, 7182.03it/s]
100%|██████████| 4/4 [00:00<00:00, 64.93it/s]
100%|██████████| 4/4 [00:00<00:00, 5087.09it/s]
features = sorted(glob(os.path.join(temp_dir, "*.tif")))
feature_names = [os.path.basename(f).split(".")[0] for f in features]

pd.DataFrame({"feature": feature_names, "file": features})
feature file
0 ppt_abs_energy /tmp/tmpupau0mxr/ppt_abs_energy.tif
1 ppt_doy_of_maximum_dates /tmp/tmpupau0mxr/ppt_doy_of_maximum_dates.tif
2 ppt_mean_abs_change /tmp/tmpupau0mxr/ppt_mean_abs_change.tif
3 ppt_minimum /tmp/tmpupau0mxr/ppt_minimum.tif
4 ppt_quantile_q_0 /tmp/tmpupau0mxr/ppt_quantile_q_0.05.tif
5 ppt_quantile_q_0 /tmp/tmpupau0mxr/ppt_quantile_q_0.95.tif
6 ppt_ratio_beyond_r_sigma_r_1 /tmp/tmpupau0mxr/ppt_ratio_beyond_r_sigma_r_1.tif
7 ppt_ratio_beyond_r_sigma_r_2 /tmp/tmpupau0mxr/ppt_ratio_beyond_r_sigma_r_2.tif
8 ppt_sum /tmp/tmpupau0mxr/ppt_sum.tif
9 ppt_symmetry_looking /tmp/tmpupau0mxr/ppt_symmetry_looking.tif
# plot the features in 3x3 grid
fig, axes = plt.subplots(3, 4, figsize=(15, 15))
axes = axes.flatten()

for i, feature in enumerate(features):
    with gw.open(feature) as src:
        src.plot(ax=axes[i], add_colorbar=False)
        axes[i].set_title(feature_names[i])
        # src.plot(robust=True)
        # plt.title(feature_names)
        # plt.show()
_images/2e51731e6ed7c7f18c59ffbc000111e0b6603c3e0a7fba963d894e707bee51f3.png

Create a unsupervised classification of timeseries features

In this example we will create a non-sense classification. The goal is to show how to use xr_fresh to create a timeseries feature dataset and then use geowombat to classify the dataset using sklearn.

Notice the use of SimpleImputer to fill missing values to learn more about this see the pygis.io tutorial

Note: You will need to install some additional features for geowombat by running mamba install geowombat-ml -c conda-forge

from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from geowombat.ml import fit_predict
from sklearn.impute import SimpleImputer  
import numpy as np

cl = Pipeline(
    [
        ("remove_nan", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("clf", KMeans(n_clusters=6, random_state=0)),
    ]
)

# fit the pipeline and plot
with gw.open(features, stack_dim="band", nodata=-9999) as src:
    display(src)
    y = fit_predict(src, cl)
    y.plot(robust=True)
<xarray.DataArray (band: 10, y: 287, x: 371)>
dask.array<concatenate, shape=(10, 287, 371), dtype=float64, chunksize=(1, 256, 256), chunktype=numpy.ndarray>
Coordinates:
  * band     (band) int64 1 1 1 1 1 1 1 1 1 1
  * x        (x) float64 33.01 33.05 33.09 33.13 ... 47.84 47.88 47.92 47.96
  * y        (y) float64 14.98 14.94 14.9 14.86 ... 3.537 3.497 3.456 3.416
Attributes: (12/13)
    transform:           (0.040424187785378464, 0.0, 32.98613723286883, 0.0, ...
    crs:                 4326
    res:                 (0.040424187785378464, 0.040424187785378464)
    is_tiled:            0
    nodatavals:          (-9999,)
    _FillValue:          -9999
    ...                  ...
    offsets:             (0.0,)
    filename:            ['ppt_abs_energy.tif', 'ppt_doy_of_maximum_dates.tif...
    resampling:          nearest
    AREA_OR_POINT:       Area
    _data_are_separate:  1
    _data_are_stacked:   1
_images/7900e2e25f6bb611552cb9e3763689826d87e48e67c387f0aa93a87d9161b285.png
# remove files from temp directory
import shutil
shutil.rmtree(temp_dir)