Dimensionality Reduction
Dimensionality reduction is a technique used in machine learning and statistics to reduce the number of features or variables in a dataset while preserving its essential characteristics. This is particularly useful when dealing with high-dimensional data, as it can help improve model performance, reduce overfitting, and make data visualization easier.
First, we will import the required libraries and read in some EVI imagery.
import geowombat as gw
import os
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import ray
import pandas as pd
# Move into the data directory so the relative glob pattern below resolves
# against it.
os.chdir("../../xr_fresh/data/")

# Plain string literal: the pattern has no placeholders, so no f-prefix.
file_glob = "evi_*tif"

# Sort the matches so the file order is deterministic.
files = sorted(glob(file_glob))

# show the matched files in a table (last expression, so the notebook renders it)
pd.DataFrame({"file": files})
/home/mmann1123/miniconda3/envs/xr_fresh_update/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
| | file |
|---|---|
| 0 | evi_20160101.tif |
| 1 | evi_20160401.tif |
| 2 | evi_20160517.tif |
| 3 | evi_20170203.tif |
Let’s take a look at the input data.
# Open all of the files together, declaring -9999 as the nodata value.
with gw.open(files, nodata=-9999) as ds:
    # Apply the nodata mask before anything is drawn.
    ds = ds.gw.mask_nodata()

    # One panel per entry along "time", wrapping after four panels.
    ds.plot(
        col="time",
        col_wrap=4,
        cmap="viridis",
        robust=True,
    )

    # Render the array summary in the notebook output.
    display(ds)
Now let’s create 3 components from the data
import xr_fresh.dimension_reduction  # This registers the accessor

# Number of CPUs handed to both the Ray runtime and the k_pca workers.
cpus = 8

# Start Ray and open the image stack under a single `with`; both are released
# when the block exits.
with ray.init(num_cpus=cpus) as rays, gw.open(files, stack_dim="band", nodata=-9999) as src:
    src = src.gw.mask_nodata()

    # request 3 components from the k_pca accessor method
    transformed_dataarray = src.gw_ext.k_pca(
        gamma=15,
        n_components=3,
        n_workers=cpus,
        chunk_size=512,
    )

    # plot each component in its own figure so each gets a separate colour scale
    for i, comp in enumerate(transformed_dataarray.component.values):
        transformed_dataarray.sel(component=comp).plot.imshow(
            figsize=(8, 6),
            robust=True,
            cbar_kwargs={'label': f'Component {i+1}'},
        )
        plt.title(f'Principal Component {i+1}')
        plt.show()
Save your outputs
# Write the components to "test.tif", replacing any existing file of that name.
transformed_dataarray.gw.save("test.tif", overwrite=True)