Final Project for the course "Aprendizaje Automático aplicado a datos de Biodiversidad" (Machine Learning Applied to Biodiversity Data)
Redbioma. Instituto Tecnológico de Costa Rica
Ana Fandiño Carro
Species Distribution Modelling with Google Earth Engine
Note: Conservation biologist Dr. Ramiro D. Crego implemented SDM using the GEE JavaScript Code Editor and published his research findings (Crego et al., 2022). The SDM methodology introduced here has been translated into Python and modified from the JavaScript source code he shared.
The methodology described in the following tutorial has also been slightly adapted to the target species of interest and to the relevant climate data. https://developers.google.com/earth-engine/tutorials/community/species-distribution-modeling/species-distribution-modeling
import ee
# Trigger the authentication flow.
ee.Authenticate()
ee.Initialize(project = 'ee-fandian24bio')
# Import libraries
import geemap
import geemap.colormaps as cm
import pandas as pd, geopandas as gpd
import numpy as np, matplotlib.pyplot as plt
import os, requests, math, random
from ipyleaflet import TileLayer
from statsmodels.stats.outliers_influence import variance_inflation_factor
Data Required for SDM
SDM typically uses the relationship between known species occurrence records and environmental variables to identify the conditions under which a population can persist. In other words, two types of model input data are required:
- Occurrence records of known species
- Various environmental variables
These data are input into algorithms to identify environmental conditions associated with the presence of species.
Workflow of SDM using GEE
The workflow for SDM using GEE is as follows:
1.- Collection and preprocessing of species occurrence data
2.- Definition of the Area of Interest
3.- Addition of GEE environmental variables
4.- Generation of pseudo-absence data
5.- Model fitting and prediction
6.- Variable importance and accuracy assessment
1.- Collection and Preprocessing of Species Occurrence Data
GBIF.org (24 August 2024) GBIF Occurrence Download https://doi.org/10.15468/dl.pka5js
# Open data as pandas DataFrame
df = pd.read_csv("Mcabrerae.csv")
df.shape
(2400, 223)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Columns: 223 entries, gbifID to iucnRedListCategory
dtypes: bool(4), float64(105), int64(10), object(104)
memory usage: 4.0+ MB
#Selection of the columns
selection = ["species", "year", "month", "decimalLongitude", "decimalLatitude"]
df = df[selection]
df.describe()
  | year | month | decimalLongitude | decimalLatitude |
---|---|---|---|---|
count | 1972.000000 | 1528.000000 | 2370.000000 | 2370.000000 |
mean | 2010.879817 | 6.746073 | -4.727127 | 39.775464 |
std | 7.447313 | 3.149357 | 2.954253 | 1.284616 |
min | 1905.000000 | 1.000000 | -8.959004 | 37.160000 |
25% | 2007.000000 | 4.000000 | -7.305551 | 38.620983 |
50% | 2012.000000 | 6.000000 | -5.300000 | 39.860000 |
75% | 2015.000000 | 10.000000 | -1.820000 | 40.659700 |
max | 2024.000000 | 12.000000 | -0.380000 | 42.590000 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   species           2400 non-null   object
 1   year              1972 non-null   float64
 2   month             1528 non-null   float64
 3   decimalLongitude  2370 non-null   float64
 4   decimalLatitude   2370 non-null   float64
dtypes: float64(4), object(1)
memory usage: 93.9+ KB
df.dropna(inplace=True)
df.shape
(1518, 5)
Next, we convert the DataFrame into a GeoDataFrame that includes a column for geographic information (geometry) and check the first row. A GeoDataFrame can be saved to a spatial file format such as a Shapefile, GeoJSON, or GeoPackage (*.gpkg) and read back in.
# Convert DataFrame to GeoDataFrame
gdf = gpd.GeoDataFrame(
df,
geometry=gpd.points_from_xy(df.decimalLongitude,
df.decimalLatitude),
crs="EPSG:4326"
)[["species", "year", "month", "geometry"]]
gdf.head(1) # Display the first row of the GeoDataFrame
  | species | year | month | geometry |
---|---|---|---|---|
270 | Microtus cabrerae | 2024.0 | 3.0 | POINT (-5.65824 40.92633) |
gdf.to_file('file.shp', driver='ESRI Shapefile')
gpd.read_file("file.shp")
  | species | year | month | geometry |
---|---|---|---|---|
0 | Microtus cabrerae | 2024.0 | 3.0 | POINT (-5.65824 40.92633) |
1 | Microtus cabrerae | 2023.0 | 11.0 | POINT (-1.49000 39.47000) |
2 | Microtus cabrerae | 2023.0 | 11.0 | POINT (-1.24000 39.57000) |
3 | Microtus cabrerae | 2023.0 | 11.0 | POINT (-1.23000 39.75000) |
4 | Microtus cabrerae | 2023.0 | 10.0 | POINT (-1.23000 39.51000) |
... | ... | ... | ... | ... |
1513 | Microtus cabrerae | 1996.0 | 9.0 | POINT (-6.68000 40.50000) |
1514 | Microtus cabrerae | 1996.0 | 9.0 | POINT (-6.68000 40.50000) |
1515 | Microtus cabrerae | 1996.0 | 9.0 | POINT (-6.68000 40.50000) |
1516 | Microtus cabrerae | 1996.0 | 9.0 | POINT (-6.68000 40.50000) |
1517 | Microtus cabrerae | 1992.0 | 10.0 | POINT (-1.17000 42.43000) |
1518 rows × 4 columns
gdf.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1518 entries, 270 to 2397
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   species   1518 non-null   object
 1   year      1518 non-null   float64
 2   month     1518 non-null   float64
 3   geometry  1518 non-null   geometry
dtypes: float64(2), geometry(1), object(1)
memory usage: 59.3+ KB
gdf.describe()
  | year | month |
---|---|---|
count | 1518.000000 | 1518.000000 |
mean | 2011.402503 | 6.760870 |
std | 7.742863 | 3.153531 |
min | 1905.000000 | 1.000000 |
25% | 2010.000000 | 4.000000 |
50% | 2013.000000 | 6.000000 |
75% | 2015.000000 | 10.000000 |
max | 2024.000000 | 12.000000 |
#gdf.to_file("microtus_cabrerae.GeoJSON", driver="GeoJSON")
#gdf = gpd.read_file("microtus_cabrerae.GeoJSON")
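As noted above, the GeoDataFrame can also be round-tripped through a GeoPackage file instead of a Shapefile or GeoJSON; a minimal sketch (the file and layer names are illustrative):
# GeoPackage round-trip (alternative to the Shapefile/GeoJSON exports above)
gdf.to_file("microtus_cabrerae.gpkg", layer="occurrences", driver="GPKG")
gdf_gpkg = gpd.read_file("microtus_cabrerae.gpkg", layer="occurrences")
print(gdf_gpkg.shape)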
We have created a function that visualizes the distribution of records by year and month from the GeoDataFrame and displays it as a graph, which can then be saved as an image file. The heatmap makes it easy to grasp the frequency of species occurrences by year and month, providing an intuitive view of temporal patterns and seasonal variation in the data and allowing outliers or quality issues to be spotted quickly.
# Yearly and monthly data distribution heatmap
def plot_heatmap(gdf, h_size= 12):
statistics = gdf.groupby(["month", "year"]).size().unstack(fill_value=0)
# Heatmap
plt.figure(figsize=(h_size, h_size - 6))
heatmap = plt.imshow(
statistics.values, cmap="YlOrBr", origin="upper", aspect="auto"
)
# Display values above each pixel
for i in range(len(statistics.index)):
for j in range(len(statistics.columns)):
plt.text(
j, i, statistics.values[i, j], ha="center", va="center", color="black"
)
plt.colorbar(heatmap, label="Count")
plt.title("Monthly Species Count by Year")
plt.xlabel("Year")
plt.ylabel("Month")
plt.xticks(range(len(statistics.columns)), statistics.columns)
plt.yticks(range(len(statistics.index)), statistics.index)
plt.tight_layout()
plt.savefig("heatmap_plot.png")
plt.show()
plot_heatmap(gdf)
# Filter the data by year (for consistency with the climatic data and because the early years contribute very few records)
filtered_gdf = gdf[
(~gdf['year'].between(1905, 1995))
]
# Reuse the heatmap function defined above on the filtered data
plot_heatmap(filtered_gdf)
gdf = filtered_gdf
#The duplicate points will be removed later
gdf.duplicated().sum()
911
Now, the filtered GeoDataFrame is converted into a Google Earth Engine object.
# Convert GeoDataFrame to Earth Engine object
data_raw = geemap.geopandas_to_ee(gdf)
Next, we will define the raster pixel size of the SDM results as 1km resolution.
# Spatial resolution setting (meters)
grain_size = 1000
When multiple occurrence points are present within the same 1km resolution raster pixel, there is a high likelihood that they share the same environmental conditions at the same geographic location. Using such data directly in the analysis can introduce bias into the results.
In other words, we need to limit the potential impact of geographic sampling bias. To achieve this, we will retain only one location within each 1km pixel and remove all others, allowing the model to more objectively reflect the environmental conditions.
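For intuition, a rough client-side equivalent of this thinning (an illustration only; the workflow below performs it server-side in GEE) would be to snap coordinates to an approximately 1 km grid, here taken as 0.01 degrees, and keep a single record per grid cell:
# Illustrative client-side thinning: snap to ~0.01 degree (~1 km) cells and keep one record per cell
thinned = gdf.assign(
    cell_x=(gdf.geometry.x / 0.01).round().astype(int),
    cell_y=(gdf.geometry.y / 0.01).round().astype(int),
).drop_duplicates(subset=["cell_x", "cell_y"]).drop(columns=["cell_x", "cell_y"])
print(len(gdf), "->", len(thinned))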
def remove_duplicates(data, grain_size):
# Select one occurrence record per pixel at the chosen spatial resolution
random_raster = ee.Image.random(seed=42).reproject("EPSG:4326", None, grain_size)
rand_point_vals = random_raster.sampleRegions(
collection=ee.FeatureCollection(data), geometries=True
)
return rand_point_vals.distinct("random")
data = remove_duplicates(data_raw, grain_size)
# Before selection and after selection
print("Original data size:", data_raw.size().getInfo())
print("Final data size:", data.size().getInfo())
Original data size: 1485
Final data size: 300
The visualization comparing geographic sampling bias before preprocessing (in blue) and after preprocessing (in red) is shown below.
# Visualization of geographic sampling bias before (blue) and after (red) preprocessing
Map = geemap.Map(layout={"height": "400px", "width": "800px"})
# Add the random raster layer
random_raster = ee.Image.random(seed=42).reproject("EPSG:4326", None, grain_size)
Map.addLayer(
random_raster,
{"min": 0, "max": 1, "palette": ["black", "white"], "opacity": 0.5},
"Random Raster",
)
# Add the original data layer in blue
Map.addLayer(data_raw, {"color": "blue"}, "Original data")
# Add the final data layer in red
Map.addLayer(data, {"color": "red"}, "Final data")
# Set the center of the map to the coordinates
Map.set_center(-3.70256, 40.4165, 5)
Map
2.- Definition of the Area of Interest
The Area of Interest (hereafter AOI) is the term researchers use for the geographical area they want to analyze; it has much the same meaning as the term Study Area.
In this context, we obtained the bounding box of the occurrence point layer geometry and created a 300-kilometer buffer around it (with a maximum tolerance of 1,000 meters) to define the AOI.
# Define the AOI
aoi = data.geometry().bounds().buffer(distance=300000, maxError=1000)
# Add the AOI to the map
outline = ee.Image().byte().paint(
featureCollection=aoi, color=1, width=3)
Map.remove_layer("Random Raster")
Map.addLayer(outline, {'palette': 'FF0000'}, "AOI")
Map.centerObject(aoi, 5)
Map
3.- Addition of GEE environmental variables
Now, let's add environmental variables to the analysis. GEE provides a wide range of datasets for environmental variables such as temperature, precipitation, elevation, land cover, and terrain. These datasets enable us to comprehensively analyze various factors that may influence the habitat preferences of the vole.
The selection of GEE environmental variables in SDM should reflect the habitat preference characteristics of the species. To do this, prior research and literature review on the vole's habitat preferences should be conducted. This tutorial primarily focuses on the workflow of SDM using GEE, so some in-depth details are omitted.
WorldClim V1 Bioclim:
This dataset provides 19 bioclimatic variables derived from monthly temperature and precipitation data. It covers the period from 1960 to 1991 and has a resolution of 927.67 meters.
CITATION: Hijmans, R.J., S.E. Cameron, J.L. Parra, P.G. Jones and A. Jarvis, 2005. Very High Resolution Interpolated Climate Surfaces for Global Land Areas. International Journal of Climatology 25: 1965-1978. doi:10.1002/joc.1276.
# WorldClim V1 Bioclim
bio = ee.Image("WORLDCLIM/V1/BIO")
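To see which of the 19 bioclimatic bands the image provides (a quick check, not part of the modelling workflow itself):
# List the bioclimatic band names (bio01 ... bio19)
print(bio.bandNames().getInfo())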
NASA SRTM Digital Elevation 30m:
This dataset contains digital elevation data from the Shuttle Radar Topography Mission (SRTM). The data was primarily collected around the year 2000 and is provided at a resolution of approximately 30 meters (1 arc-second). The following code calculates elevation, slope, aspect, and hillshade layers from the SRTM data.
# NASA SRTM Digital Elevation 30m
terrain = ee.Algorithms.Terrain(ee.Image("USGS/SRTMGL1_003"))
OpenLandMap USDA Soil Taxonomy Great Groups
Predicted USDA soil great group probabilities at 250m.
Distribution of the USDA soil great groups based on machine learning predictions from global compilation of soil profiles
soil = ee.Image("OpenLandMap/SOL/SOL_GRTGROUP_USDA-SOILTAX_C/v01")
Global Forest Cover Change (GFCC) Tree Cover Multi-Year Global 30m:
The Landsat Vegetation Continuous Fields (VCF) tree cover layers contain estimates of the percentage of horizontal ground in each 30-m pixel covered by woody vegetation greater than 5 meters in height. The dataset is available for four epochs centered on the years 2000, 2005, 2010 and 2015. The dataset is derived from the GFCC Surface Reflectance product (GFCC30SR), which is based on enhanced Global Land Survey (GLS) datasets. The GLS datasets are composed of high-resolution Landsat 5 Thematic Mapper (TM) and Landsat 7 Enhanced Thematic Mapper Plus (ETM+) images at 30 meter resolution.
Tree cover, the proportional, vertically projected area of vegetation (including leaves, stems, branches, etc.) of woody plants above a given height, affects terrestrial energy and water exchanges, photosynthesis and transpiration, net primary production, and carbon and nutrient fluxes. Tree cover also affects habitat quality and movements of wildlife, residential property value for humans, and other ecosystem services. The continuous classification scheme of the VCF product enables better depiction of land cover gradients than traditional discrete classification schemes. Importantly for detection and monitoring of forest changes (e.g., deforestation and degradation), tree cover provides a measurable attribute upon which to define forest cover and its changes. Changes in tree cover over time can be used to monitor and retrieve site-specific histories of forest change.
# Global Forest Cover Change (GFCC) Tree Cover Multi-Year Global 30m
tcc = ee.ImageCollection("NASA/MEASURES/GFCC/TC/v3")
median_tcc = (
tcc.filterDate("2000-01-01", "2015-12-31")
.select(["tree_canopy_cover"], ["TCC"])
.median()
)
EUCROPMAP
European crop type map based on Sentinel-1 and LUCAS Copernicus 2018 in-situ observations for 2018; and one based on Sentinel-2 and LUCAS Copernicus 2022 for 2022.
Capitalizing on the unique LUCAS 2018 Copernicus in-situ survey, the 2018 dataset is the first continental crop type map with 10m pixel size for the EU based on S1A and S1B Synthetic Aperture Radar observations for the year 2018; the 2022 dataset is a continuation of the work and is based on optical S2 observations for the year 2022.
crop = ee.ImageCollection("JRC/D5/EUCROPMAP/V1").filterDate('2018-01-01', '2019-01-01').first()
OpenLandMap Soil pH in H2O
Soil pH in H2O at 6 standard depths (0, 10, 30, 60, 100 and 200 cm) at 250 m resolution
pH = ee.Image("OpenLandMap/SOL/SOL_PH-H2O_USDA-4C1A2A_M/v02")
bio (bioclimatic variables), terrain (topography), median_tcc (tree canopy cover), soil (soil types), pH (soil pH), and crop (crop type) are combined into a single multiband image. The elevation band is selected from terrain, and a water mask is created for locations where elevation is greater than 0. This masks out regions below sea level (e.g. the ocean) and leaves a comprehensive stack of environmental factors for the AOI ready for analysis.
# Combine bands into a multi-band image
predictors = bio.addBands(terrain).addBands(soil).addBands(pH).addBands(crop).addBands(median_tcc)
# Create a water mask
watermask = terrain.select('elevation').gt(0)
# Mask out ocean pixels and clip to the area of interest
predictors = predictors.updateMask(watermask).clip(aoi)
When highly correlated predictor variables are included together in a model, multicollinearity issues can arise. Multicollinearity is a phenomenon that occurs when there are strong linear relationships among independent variables in a model, leading to instability in the estimation of the model's coefficients (weights). This instability can reduce the model's reliability and make predictions or interpretations for new data challenging. Therefore, we will consider multicollinearity and proceed with the process of selecting predictor variables.
First, we will generate 5,000 random points and then extract the predictor variable values of the single multiband image at those points.
# Generate 5,000 random points
data_cor = predictors.sample(scale=grain_size, numPixels=5000, geometries=True)
# Extract predictor variable values
pvals = predictors.sampleRegions(collection=data_cor, scale=grain_size)
We will convert the extracted predictor values for each point into a DataFrame and then check the first row.
# Converting predictor values from Earth Engine to a DataFrame
pvals_df = geemap.ee_to_df(pvals)
pvals_df.head(1)
  | TCC | aspect | b0 | b10 | b100 | b200 | b30 | b60 | bio01 | bio02 | ... | bio15 | bio16 | bio17 | bio18 | bio19 | classification | elevation | grtgroup | hillshade | slope |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.0 | 28 | 72 | 72 | 75 | 75 | 72 | 74 | 163 | 115 | ... | 53 | 213 | 33 | 34 | 202 | 300 | 301 | 17 | 179 | 1 |
1 rows × 32 columns
pvals_df.shape
(2122, 32)
# Displaying the columns
columns = pvals_df.columns
print(columns)
Index(['TCC', 'aspect', 'b0', 'b10', 'b100', 'b200', 'b30', 'b60', 'bio01', 'bio02', 'bio03', 'bio04', 'bio05', 'bio06', 'bio07', 'bio08', 'bio09', 'bio10', 'bio11', 'bio12', 'bio13', 'bio14', 'bio15', 'bio16', 'bio17', 'bio18', 'bio19', 'classification', 'elevation', 'grtgroup', 'hillshade', 'slope'], dtype='object')
We calculate Spearman correlation coefficients between the predictor variables and visualize them in a heatmap.
def plot_correlation_heatmap(dataframe, h_size=10, show_labels=False):
# Calculate Spearman correlation coefficients
correlation_matrix = dataframe.corr(method="spearman")
# Create a heatmap
plt.figure(figsize=(h_size, h_size-2))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest')
# Optionally display values on the heatmap
if show_labels:
for i in range(correlation_matrix.shape[0]):
for j in range(correlation_matrix.shape[1]):
plt.text(j, i, f"{correlation_matrix.iloc[i, j]:.2f}",
ha='center', va='center', color='white', fontsize=8)
columns = dataframe.columns.tolist()
plt.xticks(range(len(columns)), columns, rotation=90)
plt.yticks(range(len(columns)), columns)
plt.title("Variables Correlation Matrix")
plt.colorbar(label="Spearman Correlation")
plt.savefig('correlation_heatmap_plot.png')
plt.show()
# Plot the correlation heatmap of variables
plot_correlation_heatmap(pvals_df)
The Spearman correlation coefficient is useful for understanding the general pairwise associations among predictor variables, but it does not directly assess how several variables interact jointly; in particular, it does not detect multicollinearity.
The Variance Inflation Factor (hereafter VIF) is a statistical metric used to evaluate multicollinearity and guide variable selection. It indicates the degree of linear relationship of each independent variable with the other independent variables, and high VIF values can be evidence of multicollinearity.
Typically, when VIF values exceed 5 or 10, it suggests that the variable has a strong correlation with other variables, potentially compromising the stability and interpretability of the model. In this tutorial, a criterion of VIF values less than 10 was used for variable selection. The following 8 variables were selected based on VIF.
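For reference, VIF_i = 1 / (1 - R_i²), where R_i² comes from regressing predictor i on all the remaining predictors. A minimal hand computation for a single, illustratively chosen column of pvals_df (this simple version adds an intercept, so its value may differ slightly from statsmodels' variance_inflation_factor, which uses the raw design matrix):
# Conceptual check of what VIF measures for one predictor
target = "elevation"  # illustrative choice of predictor
y = pvals_df[target].to_numpy(dtype=float)
X = pvals_df.drop(columns=[target]).to_numpy(dtype=float)
X = np.column_stack([np.ones(len(X)), X])  # add an intercept column
beta, *_ = np.linalg.lstsq(X, y, rcond=None)
resid = y - X @ beta
r2 = 1 - (resid @ resid) / ((y - y.mean()) @ (y - y.mean()))
print(f"VIF({target}) ~ {1 / (1 - r2):.2f}")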
# Filter variables based on Variance Inflation Factor (VIF)
def filter_variables_by_vif(dataframe, threshold=10):
original_columns = dataframe.columns.tolist()
remaining_columns = original_columns[:]
while True:
vif_data = dataframe[remaining_columns]
vif_values = [
variance_inflation_factor(vif_data.values, i)
for i in range(vif_data.shape[1])
]
max_vif_index = vif_values.index(max(vif_values))
max_vif = max(vif_values)
if max_vif < threshold:
break
print(f"Removing '{remaining_columns[max_vif_index]}' with VIF {max_vif:.2f}")
del remaining_columns[max_vif_index]
filtered_data = dataframe[remaining_columns]
bands = filtered_data.columns.tolist()
print("Bands:", bands)
return filtered_data, bands
filtered_pvals_df, bands = filter_variables_by_vif(pvals_df)
Removing 'bio05' with VIF inf
Removing 'bio10' with VIF 156478.74
Removing 'b10' with VIF 50583.87
Removing 'b100' with VIF 29931.65
Removing 'b30' with VIF 24176.46
Removing 'bio01' with VIF 22847.52
Removing 'b60' with VIF 21108.69
Removing 'bio07' with VIF 12967.63
Removing 'b200' with VIF 4767.68
Removing 'bio12' with VIF 3190.19
Removing 'bio16' with VIF 2083.59
Removing 'bio02' with VIF 1520.00
Removing 'bio11' with VIF 682.07
Removing 'bio17' with VIF 536.63
Removing 'hillshade' with VIF 430.01
Removing 'bio13' with VIF 294.89
Removing 'bio03' with VIF 287.12
Removing 'b0' with VIF 180.75
Removing 'bio18' with VIF 135.88
Removing 'bio04' with VIF 112.46
Removing 'bio15' with VIF 65.95
Removing 'classification' with VIF 19.97
Removing 'bio19' with VIF 13.05
Removing 'bio09' with VIF 11.55
Bands: ['TCC', 'aspect', 'bio06', 'bio08', 'bio14', 'elevation', 'grtgroup', 'slope']
# Variable Selection Based on VIF
predictors = predictors.select(bands)
# Plot the correlation heatmap of variables
plot_correlation_heatmap(filtered_pvals_df, h_size=6, show_labels=True)
Next, let's visualize the 8 selected predictor variables on the map.
You can explore the available palettes for map visualization with cm.plot_colormaps(width=8.0, height=0.2). For example, the terrain palette looks like this:
cm.plot_colormap('terrain', width=8.0, height=0.2, orientation='horizontal')
# Soil types layer (grtgroup)
Map = geemap.Map(layout={'height':'400px', 'width':'800px'})
vis_params = {'bands':['grtgroup'], 'min': 0, 'max': 433, 'palette': [
'ffffff', 'adff2d', 'adff22', 'a5ff2f', '87ff37', 'baf019',
'87ff19', '96f03d', 'a3f52f', 'aff319', '91ff37', '9cf319',
'9bff37', '91ff19', '71ff37', '86ff19', 'a9d42d', 'aff519',
'9bff19', '9af024', 'a5fd2f', '88ff37', 'afed19', '71ff19',
'aff026', '8cf537', 'b7ff19', '7177c0', '9a85ec', 'f5f5e1',
'52cf5a', 'e42777', '4ef76d', 'ff00fb', 'eb05eb', 'fa04fa',
'fc04f5', 'f50df0', 'f118f1', 'fa0cfa', 'fc05e1', 'f100d5',
'eb09e6', 'fa22fa', 'ffdab9', 'f5d2bb', 'e8c9b8', 'ffddc4',
'e7cbc0', 'ffd2c3', 'f5d6bb', 'd5d3b9', 'e8d4b8', 'e7cdc0',
'f3eac8', 'a0c4ba', 'ffd2b9', 'f5dabb', 'f5d5b9', 'e8ebb8',
'ffddc2', 'e7ffc0', 'f3e6c8', 'ffdab9', 'f5cdb9', 'a91d30',
'796578', 'd8ff6e', '177548', '43efd6', '8496a9', '296819',
'73ffd4', '6fffc8', '75fbc9', '86f5d1', '82ffd2', '88eec8',
'80ffd4', '6bffc9', '88eec8', '7fffc8', '81ffd2', '86f0d4',
'67ffc8', '88eec8', '7ffbcb', '87ffd2', '8af5ce', '6bfad2',
'78f0d4', '88eec8', '7ffbd4', '73f5cd', '88c8d2', '91f0cd',
'73cdd2', '88eec8', 'fb849b', 'dd4479', '61388b', 'a52a30',
'722328', 'd81419', 'a42828', '82f5cd', 'a54c2e', 'c11919',
'b91419', '21b199', '702028', 'b41919', 'b22328', 'a2c7eb',
'36ba79', '806797', 'cb5b5f', 'cd5c5c', 'd94335', 'd35740',
'e05a5d', 'cf5b5c', 'ca5964', 'ca5d5f', 'cd5e5a', 'ca5969',
'd95a35', 'd36240', 'e05c43', 'd64755', 'cf595c', 'ff5f5f',
'cd6058', 'd95f35', 'd35140', 'd65a55', 'e05c59', 'cf525e',
'c65978', 'f5615f', '826f9a', 'cff41a', '4a6f31', 'a96989',
'e16438', '24f640', '88c1f9', 'f5d25c', 'd74322', '7f939e',
'41a545', '8f8340', '09fe03', '0aff00', '0ff30f', '02f00a',
'0fc903', '17f000', '0cff00', '0ac814', '0cfe00', '0aff0a',
'03ff05', '1cf31c', '24f000', '00ff0c', '14c814', '00fe4c',
'14ff96', '44d205', '05f305', '62f00a', '0fcd03', '00d20f',
'1add11', '09ff0c', '03ff05', '05e700', '02f00a', '0fea03',
'00f000', '0ccb0c', '14dd14', '6a685d', 'fae6b9', '769a34',
'6ff2df', 'ca7fc6', 'd8228f', 'c01bf0', 'd2bad3', 'd8c3cb',
'd4c6d4', 'd5bed5', 'ddb9dd', 'd8d2d8', 'd4c9d4', 'd2bad5',
'd5bad5', 'd5b2d5', 'd8c8d2', 'd4cbd4', '552638', '2571eb',
'ffa514', 'f3a502', 'fb7b00', 'f0b405', 'f7a80f', 'fb9113',
'ffa519', 'f3a702', 'fbba07', 'f7970f', 'f3a702', 'fb5a00',
'f0c005', 'f7810f', 'ff9c00', 'f3b002', 'f0b005', 'f7980f',
'4d7cfc', 'ffff00', 'fafa05', 'ebeb22', 'ffff14', 'f1f10a',
'fafa05', 'ebeb1e', 'f5eb0c', 'eef506', 'f1f129', 'fafa05',
'ebeb0c', 'f5d202', 'ffd700', 'f1f12b', 'a91fac', '2da468',
'9a8b71', '76b989', '713959',
]}
Map.addLayer(predictors, vis_params, 'grtgroup')
Map.add_colorbar(vis_params, label="grtgroup", orientation="vertical", layer_name="soil type")
Map.centerObject(aoi, 5)
Map
# Elevation layer
Map = geemap.Map(layout={'height':'400px', 'width':'800px'})
vis_params = {'bands':['elevation'], 'min': 0, 'max': 1800, 'palette': cm.palettes.terrain}
Map.addLayer(predictors, vis_params, 'elevation')
Map.add_colorbar(vis_params, label="Elevation (m)", orientation="vertical", layer_name="elevation")
Map.centerObject(aoi, 5)
Map
# Slope layer
Map = geemap.Map(layout={'height':'400px', 'width':'800px'})
vis_params = {'bands':['slope'], 'min': 0, 'max': 25, 'palette': cm.palettes.RdYlGn_r}
Map.addLayer(predictors, vis_params, 'slope')
Map.add_colorbar(vis_params, label="Slope", orientation="vertical", layer_name="slope")
Map.centerObject(aoi, 5)
Map
# Aspect layer
Map = geemap.Map(layout={'height':'400px', 'width':'800px'})
vis_params = {'bands':['aspect'], 'min': 0, 'max': 360, 'palette': cm.palettes.rainbow}
Map.addLayer(predictors, vis_params, 'aspect')
Map.add_colorbar(vis_params, label="Aspect", orientation="vertical", layer_name="aspect")
Map.centerObject(aoi, 5)
Map
# Calculate the minimum and maximum values for bio06
min_max_val = (
predictors.select("bio06")
.multiply(0.1)
.reduceRegion(reducer=ee.Reducer.minMax(), scale=1000)
.getInfo()
)
# bio06 (Min temperature of coldest month) layer
Map = geemap.Map(layout={"height": "400px", "width": "800px"})
vis_params = {
"min": math.floor(min_max_val["bio06_min"]),
"max": math.ceil(min_max_val["bio06_max"]),
"palette": cm.palettes.hot,
}
Map.addLayer(predictors.select("bio06").multiply(0.1), vis_params, "bio06")
Map.add_colorbar(
vis_params,
label="Min temperature of coldest month (℃)",
orientation="vertical",
layer_name="bio06",
)
Map.centerObject(aoi, 5)
Map
# Calculate the minimum and maximum values for bio08
min_max_val = (
predictors.select("bio08")
.multiply(0.1)
.reduceRegion(reducer=ee.Reducer.minMax(), scale=1000)
.getInfo()
)
# bio08 (Mean temperature of wettest quarter) layer
Map = geemap.Map(layout={"height": "400px", "width": "800px"})
vis_params = {
"min": math.floor(min_max_val["bio08_min"]),
"max": math.ceil(min_max_val["bio08_max"]),
"palette": cm.palettes.hot,
}
Map.addLayer(predictors.select("bio08").multiply(0.1), vis_params, "bio08")
Map.add_colorbar(
vis_params,
label="Mean temperature of wettest quarter (℃)",
orientation="vertical",
layer_name="bio08",
)
Map.centerObject(aoi, 5)
Map
# Calculate the minimum and maximum values for bio14
min_max_val = (
predictors.select("bio14")
.reduceRegion(reducer=ee.Reducer.minMax(), scale=1000)
.getInfo()
)
# bio14 (Precipitation of driest month) layer
Map = geemap.Map(layout={"height": "400px", "width": "800px"})
vis_params = {
"bands": ["bio14"],
"min": math.floor(min_max_val["bio14_min"]),
"max": math.ceil(min_max_val["bio14_max"]),
"palette": cm.palettes.Blues,
}
Map.addLayer(predictors, vis_params, "bio14")
Map.add_colorbar(
vis_params,
label="Precipitation of driest month (mm)",
orientation="vertical",
layer_name="bio14",
)
Map.centerObject(aoi, 5)
Map
# TCC layer
Map = geemap.Map(layout={"height": "400px", "width": "800px"})
vis_params = {
"bands": ["TCC"],
"min": 0,
"max": 100,
"palette": ["ffffff", "afce56", "5f9c00", "0e6a00", "003800"],
}
Map.addLayer(predictors, vis_params, "TCC")
Map.add_colorbar(
vis_params, label="Tree Canopy Cover (%)", orientation="vertical", layer_name="TCC"
)
Map.centerObject(aoi, 5)
Map
4.- Generation of pseudo-absence data
In the process of SDM, the selection of input data for a species is mainly approached using two methods:
Presence-Background Method: This method compares the locations where a particular species has been observed (presence) with other locations where the species has not been observed (background). Here, the background data does not necessarily mean areas where the species does not exist but rather is set up to reflect the overall environmental conditions of the study area. It is used to distinguish suitable environments where the species could exist from less suitable ones.
Presence-Absence Method: This method compares locations where the species has been observed (presence) with locations where it has definitively not been observed (absence). Here, absence data represents specific locations where the species is known not to exist. It does not reflect the overall environmental conditions of the study area but rather points to locations where the species is estimated not to exist.
In practice, it is often difficult to collect true absence data, so pseudo-absence data generated artificially is frequently used. However, it's important to acknowledge the limitations and potential errors of this method, as artificially generated pseudo-absence points may not accurately reflect true absence areas.
The choice between these two methods depends on data availability, research objectives, model accuracy and reliability, as well as time and resources. Here, we will use occurrence data collected from GBIF and artificially generated pseudo-absence data to model using the "Presence-Absence" method.
The generation of pseudo-absence data will be done through the "environmental profiling approach", and the specific steps are as follows:
1.- Environmental classification using k-means clustering: The k-means clustering algorithm, based on Euclidean distance, is used to divide the pixels within the study area into two clusters. One cluster represents areas with environmental characteristics similar to those of 100 randomly selected presence locations, while the other cluster represents areas with different characteristics.
2.- Generation of pseudo-absence data within dissimilar clusters: Within the second cluster identified in the first step (which has environmental characteristics different from those of the presence data), pseudo-absence points are generated at random. These pseudo-absence points represent locations where the species is not expected to occur.
# Randomly select 100 locations for occurrence
pvals = predictors.sampleRegions(
collection=data.randomColumn().sort('random').limit(100),
properties=[],
scale=grain_size
)
# Perform k-means clustering
clusterer = ee.Clusterer.wekaKMeans(
nClusters=2,
distanceFunction="Euclidean"
).train(pvals)
cl_result = predictors.cluster(clusterer)
# Get cluster ID for locations similar to occurrence
cl_id = cl_result.sampleRegions(
collection=data.randomColumn().sort('random').limit(200),
properties=[],
scale=grain_size
)
# Define non-occurrence areas in dissimilar clusters
cl_id = ee.FeatureCollection(cl_id).reduceColumns(ee.Reducer.mode(),['cluster'])
cl_id = ee.Number(cl_id.get('mode')).subtract(1).abs()
cl_mask = cl_result.select(['cluster']).eq(cl_id)
# Presence location mask
presence_mask = data.reduceToImage(properties=['random'],
reducer=ee.Reducer.first()
).reproject('EPSG:4326', None,
grain_size).mask().neq(1).selfMask()
# Masking presence locations in non-occurrence areas and clipping to AOI
area_for_pa = presence_mask.updateMask(cl_mask).clip(aoi)
# Area for Pseudo-absence
Map = geemap.Map(layout={'height':'400px', 'width':'800px'})
Map.addLayer(area_for_pa, {'palette': 'black'}, 'AreaForPA')
Map.centerObject(aoi, 5)
Map
5.- Model fitting and prediction
We will now divide the data into training data and test data. The training data will be used to find the optimal parameters by training the model, while the test data will be used to evaluate the model trained beforehand. An important concept to consider in this context is spatial autocorrelation.
Spatial autocorrelation is an essential element in SDM, associated with Tobler's law. It embodies the concept that "everything is related to everything else, but near things are more related than distant things". Spatial autocorrelation represents the significant relationship between the location of species and environmental variables. However, if spatial autocorrelation exists between the training and test data, the independence between the two data sets can be compromised. This significantly impacts the evaluation of the model's generalization ability.
One method to address this issue is the spatial block cross-validation technique, which involves dividing the data into training and testing datasets. This technique involves dividing the data into multiple blocks, using each block independently as training and test datasets to reduce the impact of spatial autocorrelation. This enhances the independence between datasets, allowing for a more accurate evaluation of the model's generalization ability.
The specific procedure is as follows:
1.- Creation of spatial blocks: Divide the entire dataset into spatial blocks of equal size (e.g., 50x50 km, matching the 50,000 m scale used below).
2.- Assignment of training and testing sets: Each spatial block is randomly assigned to either the training set (70%) or the test set (30%). This prevents the model from overfitting to data from specific areas and aims to achieve more generalized results.
3.- Iterative cross-validation: The entire process is repeated n times (e.g., 10 times). In each iteration, the blocks are randomly divided into training and test sets again, which is intended to improve the model's stability and reliability.
4.- Generation of pseudo-absence data: In each iteration, pseudo-absence data are randomly generated to evaluate the model's performance.
Scale = 50000
grid = watermask.reduceRegions(
collection=aoi.coveringGrid(scale=Scale, proj='EPSG:4326'),
reducer=ee.Reducer.mean()).filter(ee.Filter.neq('mean', None))
Map = geemap.Map(layout={'height':'400px', 'width':'800px'})
Map.addLayer(grid, {}, "Grid for spatial block cross validation")
Map.addLayer(outline, {'palette': 'FF0000'}, "Study Area")
Map.centerObject(aoi, 5)
Map
Now we can fit the model. Fitting a model involves learning the patterns in the data and adjusting the model's parameters (weights and biases) accordingly, which enables the model to make more accurate predictions when presented with new data. For this purpose, we have defined a function called sdm() to fit the model.
We will use the Random Forest algorithm.
def sdm(x):
seed = ee.Number(x)
# Random block division for training and validation
rand_blk = ee.FeatureCollection(grid).randomColumn(seed=seed).sort("random")
training_grid = rand_blk.filter(ee.Filter.lt("random", split)) # Grid for training
testing_grid = rand_blk.filter(ee.Filter.gte("random", split)) # Grid for testing
# Presence points
presence_points = ee.FeatureCollection(data)
presence_points = presence_points.map(lambda feature: feature.set("PresAbs", 1))
tr_presence_points = presence_points.filter(
ee.Filter.bounds(training_grid)
) # Presence points for training
te_presence_points = presence_points.filter(
ee.Filter.bounds(testing_grid)
) # Presence points for testing
# Pseudo-absence points for training
tr_pseudo_abs_points = area_for_pa.sample(
region=training_grid,
scale=grain_size,
numPixels=tr_presence_points.size().add(300),
seed=seed,
geometries=True,
)
# Same number of pseudo-absence points as presence points for training
tr_pseudo_abs_points = (
tr_pseudo_abs_points.randomColumn()
.sort("random")
.limit(ee.Number(tr_presence_points.size()))
)
tr_pseudo_abs_points = tr_pseudo_abs_points.map(lambda feature: feature.set("PresAbs", 0))
te_pseudo_abs_points = area_for_pa.sample(
region=testing_grid,
scale=grain_size,
numPixels=te_presence_points.size().add(100),
seed=seed,
geometries=True,
)
# Same number of pseudo-absence points as presence points for testing
te_pseudo_abs_points = (
te_pseudo_abs_points.randomColumn()
.sort("random")
.limit(ee.Number(te_presence_points.size()))
)
te_pseudo_abs_points = te_pseudo_abs_points.map(lambda feature: feature.set("PresAbs", 0))
# Merge training and pseudo-absence points
training_partition = tr_presence_points.merge(tr_pseudo_abs_points)
testing_partition = te_presence_points.merge(te_pseudo_abs_points)
# Extract predictor variable values at training points
train_pvals = predictors.sampleRegions(
collection=training_partition,
properties=["PresAbs"],
scale=grain_size,
geometries=True,
)
# Random Forest classifier
classifier = ee.Classifier.smileRandomForest(
numberOfTrees=500,
variablesPerSplit=None,
minLeafPopulation=10,
bagFraction=0.5,
maxNodes=None,
seed=seed,
)
# Presence probability: Habitat suitability map
classifier_pr = classifier.setOutputMode("PROBABILITY").train(
train_pvals, "PresAbs", bands
)
classified_img_pr = predictors.select(bands).classify(classifier_pr)
# Binary presence/absence map: Potential distribution map
classifier_bin = classifier.setOutputMode("CLASSIFICATION").train(
train_pvals, "PresAbs", bands
)
classified_img_bin = predictors.select(bands).classify(classifier_bin)
return [
classified_img_pr,
classified_img_bin,
training_partition,
testing_partition,
], classifier_pr
Spatial blocks are divided into 70% for model training and 30% for model testing, respectively. Pseudo-absence data are randomly generated within each training and testing set in every iteration. As a result, each execution yields different sets of presence and pseudo-absence data for model training and testing.
split = 0.7
numiter = 10
# Random Seed
#runif = lambda length: [random.randint(1, 1000) for _ in range(length)]
#items = runif(numiter)
# Fixed seed
items = [287, 288, 553, 226, 151, 255, 902, 267, 419, 538]
results_list = [] # Initialize SDM results list
importances_list = [] # Initialize variable importance list
for item in items:
result, trained = sdm(item)
# Accumulate SDM results into the list
results_list.extend(result)
# Accumulate variable importance into the list
importance = ee.Dictionary(trained.explain()).get('importance')
importances_list.extend(importance.getInfo().items())
# Flatten the SDM results list
results = ee.List(results_list).flatten()
Now we can visualize the habitat suitability map and potential distribution map for Microtus cabrerae. The habitat suitability map is created with the mean() function, which averages each pixel location across all images, and the potential distribution map is generated with the mode() function, which takes the most frequently occurring value at each pixel location across all images.
# Habitat suitability map
images = ee.List.sequence(
0, ee.Number(numiter).multiply(4).subtract(1), 4).map(
lambda x: results.get(x))
model_average = ee.ImageCollection.fromImages(images).mean()
Map = geemap.Map(layout={'height':'400px', 'width':'800px'}, basemap='Esri.WorldImagery')
vis_params = {
'min': 0,
'max': 1,
'palette': cm.palettes.viridis_r}
Map.addLayer(model_average, vis_params, 'Habitat suitability')
Map.add_colorbar(vis_params, label="Habitat suitability",
orientation="horizontal",
layer_name="Habitat suitability")
Map.addLayer(data, {'color':'red'}, 'Presence')
Map.centerObject(aoi, 5)
Map
# Potential distribution map
images2 = ee.List.sequence(1, ee.Number(numiter).multiply(4).subtract(1), 4).map(
lambda x: results.get(x)
)
distribution_map = ee.ImageCollection.fromImages(images2).mode()
Map = geemap.Map(
layout={"height": "400px", "width": "800px"}, basemap="Esri.WorldImagery"
)
vis_params = {"min": 0, "max": 1, "palette": ["white", "green"]}
Map.addLayer(distribution_map, vis_params, "Potential distribution")
Map.addLayer(data, {"color": "red"}, "Presence")
Map.add_colorbar(
vis_params,
label="Potential distribution",
discrete=True,
orientation="horizontal",
layer_name="Potential distribution",
)
Map.centerObject(data.geometry(), 5)
Map
6.- Variable importance and accuracy assessment
Random Forest (ee.Classifier.smileRandomForest) is one of the ensemble learning methods, which operates by constructing multiple decision trees to make predictions. Each decision tree independently learns from different subsets of the data, and their results are aggregated to enable more accurate and stable predictions.
Variable importance is a measure that evaluates the impact of each variable on the predictions within the Random Forest model. We will use the previously defined importances_list to calculate and print the average variable importance.
def plot_variable_importance(importances_list):
# Extract each variable importance value into a list
variables = [item[0] for item in importances_list]
importances = [item[1] for item in importances_list]
# Calculate the average importance for each variable
average_importances = {}
for variable in set(variables):
indices = [i for i, var in enumerate(variables) if var == variable]
average_importance = np.mean([importances[i] for i in indices])
average_importances[variable] = average_importance
# Sort the importances in descending order of importance
sorted_importances = sorted(average_importances.items(),
key=lambda x: x[1], reverse=False)
variables = [item[0] for item in sorted_importances]
avg_importances = [item[1] for item in sorted_importances]
# Adjust the graph size
plt.figure(figsize=(8, 4))
# Plot the average importance as a horizontal bar chart
plt.barh(variables, avg_importances)
plt.xlabel('Importance')
plt.ylabel('Variables')
plt.title('Average Variable Importance')
# Display values above the bars
for i, v in enumerate(avg_importances):
plt.text(v + 0.02, i, f"{v:.2f}", va='center')
# Adjust the x-axis range
plt.xlim(0, max(avg_importances) + 5) # Adjust to the desired range
plt.tight_layout()
plt.savefig('variable_importance.png')
plt.show()
plot_variable_importance(importances_list)
Using the Testing Datasets, we calculate AUC-ROC and AUC-PR for each run. Then, we compute the average AUC-ROC and AUC-PR over n iterations.
AUC-ROC represents the area under the curve of the 'Sensitivity (Recall) vs. 1-Specificity' graph, illustrating the relationship between sensitivity and specificity as the threshold changes. Specificity is based on all observed non-occurrences. Therefore, AUC-ROC encompasses all quadrants of the confusion matrix.
AUC-PR represents the area under the curve of the 'Precision vs. Recall (Sensitivity)' graph, showing the relationship between precision and recall as the threshold varies. Precision is based on all predicted occurrences. Hence, AUC-PR does not include the true negatives (TN).
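In the code below, both areas are computed with the trapezoidal rule over 25 threshold values. A toy NumPy illustration of that integration (the rate values here are made up for the example):
# Toy illustration of the trapezoidal integration used for AUC below
fpr = np.array([1.0, 0.6, 0.3, 0.1, 0.0])  # hypothetical false positive rates
tpr = np.array([1.0, 0.9, 0.8, 0.5, 0.0])  # hypothetical true positive rates
# Sum of trapezoid areas, mirroring x1.multiply(y1).multiply(0.5) in the GEE code
auc_roc = np.abs(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) * 0.5))
print(f"toy AUC-ROC = {auc_roc:.3f}")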
Note: It's important to ensure that each run has a sufficient number of points for model validation. The final number of points may vary due to the random partitioning of spatial blocks, so it's crucial to verify if there are enough presence and pseudo-absence points for model validation. In the case of endangered or rare species, there might be a shortage of occurrence data, leading to an insufficient test dataset. In such cases, alternatives may include additional data collection based on expert knowledge and experience or utilizing relevant alternative data sources.
def print_pres_abs_sizes(TestingDatasets, numiter):
# Check and print the sizes of presence and pseudo-absence coordinates
def get_pres_abs_size(x):
fc = ee.FeatureCollection(TestingDatasets.get(x))
presence_size = fc.filter(ee.Filter.eq("PresAbs", 1)).size()
pseudo_absence_size = fc.filter(ee.Filter.eq("PresAbs", 0)).size()
return ee.List([presence_size, pseudo_absence_size])
sizes_info = (
ee.List.sequence(0, ee.Number(numiter).subtract(1), 1)
.map(get_pres_abs_size)
.getInfo()
)
for i, sizes in enumerate(sizes_info):
presence_size = sizes[0]
pseudo_absence_size = sizes[1]
print(
f"Iteration {i + 1}: Presence Size = {presence_size}, Pseudo-absence Size = {pseudo_absence_size}"
)
# Extracting the Testing Datasets
testing_datasets = ee.List.sequence(
3, ee.Number(numiter).multiply(4).subtract(1), 4
).map(lambda x: results.get(x))
print_pres_abs_sizes(testing_datasets, numiter)
Iteration 1: Presence Size = 39, Pseudo-absence Size = 29
Iteration 2: Presence Size = 52, Pseudo-absence Size = 40
Iteration 3: Presence Size = 124, Pseudo-absence Size = 69
Iteration 4: Presence Size = 39, Pseudo-absence Size = 39
Iteration 5: Presence Size = 115, Pseudo-absence Size = 62
Iteration 6: Presence Size = 49, Pseudo-absence Size = 35
Iteration 7: Presence Size = 74, Pseudo-absence Size = 42
Iteration 8: Presence Size = 75, Pseudo-absence Size = 36
Iteration 9: Presence Size = 165, Pseudo-absence Size = 86
Iteration 10: Presence Size = 147, Pseudo-absence Size = 71
def get_acc(hsm, t_data, grain_size):
pr_prob_vals = hsm.sampleRegions(
collection=t_data, properties=["PresAbs"], scale=grain_size
)
seq = ee.List.sequence(start=0, end=1, count=25) # Divide 0 to 1 into 25 intervals
def calculate_metrics(cutoff):
# Each element of the seq list is passed as cutoff(threshold value)
# Observed present = TP + FN
pres = pr_prob_vals.filterMetadata("PresAbs", "equals", 1)
# TP (True Positive)
tp = ee.Number(
pres.filterMetadata("classification", "greater_than", cutoff).size()
)
# TPR (True Positive Rate) = Recall = Sensitivity = TP / (TP + FN) = TP / Observed present
tpr = tp.divide(pres.size())
# Observed absent = FP + TN
abs = pr_prob_vals.filterMetadata("PresAbs", "equals", 0)
# FN (False Negative)
fn = ee.Number(
pres.filterMetadata("classification", "less_than", cutoff).size()
)
# TNR (True Negative Rate) = Specificity = TN / (FP + TN) = TN / Observed absent
tn = ee.Number(abs.filterMetadata("classification", "less_than", cutoff).size())
tnr = tn.divide(abs.size())
# FP (False Positive)
fp = ee.Number(
abs.filterMetadata("classification", "greater_than", cutoff).size()
)
# FPR (False Positive Rate) = FP / (FP + TN) = FP / Observed absent
fpr = fp.divide(abs.size())
# Precision = TP / (TP + FP) = TP / Predicted present
precision = tp.divide(tp.add(fp))
# SUMSS = SUM of Sensitivity and Specificity
sumss = tpr.add(tnr)
return ee.Feature(
None,
{
"cutoff": cutoff,
"TP": tp,
"TN": tn,
"FP": fp,
"FN": fn,
"TPR": tpr,
"TNR": tnr,
"FPR": fpr,
"Precision": precision,
"SUMSS": sumss,
},
)
return ee.FeatureCollection(seq.map(calculate_metrics))
def calculate_and_print_auc_metrics(images, testing_datasets, grain_size, numiter):
# Calculate AUC-ROC and AUC-PR
def calculate_auc_metrics(x):
hsm = ee.Image(images.get(x))
t_data = ee.FeatureCollection(testing_datasets.get(x))
acc = get_acc(hsm, t_data, grain_size)
# Calculate AUC-ROC
x = ee.Array(acc.aggregate_array("FPR"))
y = ee.Array(acc.aggregate_array("TPR"))
x1 = x.slice(0, 1).subtract(x.slice(0, 0, -1))
y1 = y.slice(0, 1).add(y.slice(0, 0, -1))
auc_roc = x1.multiply(y1).multiply(0.5).reduce("sum", [0]).abs().toList().get(0)
# Calculate AUC-PR
x = ee.Array(acc.aggregate_array("TPR"))
y = ee.Array(acc.aggregate_array("Precision"))
x1 = x.slice(0, 1).subtract(x.slice(0, 0, -1))
y1 = y.slice(0, 1).add(y.slice(0, 0, -1))
auc_pr = x1.multiply(y1).multiply(0.5).reduce("sum", [0]).abs().toList().get(0)
return (auc_roc, auc_pr)
auc_metrics = (
ee.List.sequence(0, ee.Number(numiter).subtract(1), 1)
.map(calculate_auc_metrics)
.getInfo()
)
# Print AUC-ROC and AUC-PR for each iteration
df = pd.DataFrame(auc_metrics, columns=["AUC-ROC", "AUC-PR"])
df.index = [f"Iteration {i + 1}" for i in range(len(df))]
df.to_csv("auc_metrics.csv", index_label="Iteration")
print(df)
# Calculate mean and standard deviation of AUC-ROC and AUC-PR
mean_auc_roc, std_auc_roc = df["AUC-ROC"].mean(), df["AUC-ROC"].std()
mean_auc_pr, std_auc_pr = df["AUC-PR"].mean(), df["AUC-PR"].std()
print(f"Mean AUC-ROC = {mean_auc_roc:.4f} ± {std_auc_roc:.4f}")
print(f"Mean AUC-PR = {mean_auc_pr:.4f} ± {std_auc_pr:.4f}")
%%time
# Calculate AUC-ROC
calculate_and_print_auc_metrics(images, testing_datasets, grain_size, numiter)
HttpError: <HttpError 400 when requesting https://earthengine.googleapis.com/v1/projects/ee-fandian24bio/value:compute?prettyPrint=false&alt=json returned "User memory limit exceeded.". Details: "User memory limit exceeded.">

During handling of the above exception, another exception occurred:

EEException: User memory limit exceeded.
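The request exceeds Earth Engine's per-request memory limit because all ten iterations are evaluated in a single getInfo() call. One possible workaround (a sketch under that assumption, not a tested fix) is to evaluate one iteration at a time, or to pull the sampled probability/label pairs client-side and compute the metrics locally, for example with scikit-learn:
# Sketch: compute AUC-ROC / AUC-PR for one iteration client-side.
# Assumes scikit-learn is available and the sampled FeatureCollection is
# small enough to convert with geemap.ee_to_df.
from sklearn.metrics import roc_auc_score, average_precision_score

hsm = ee.Image(images.get(0))                           # habitat suitability map, iteration 1
t_data = ee.FeatureCollection(testing_datasets.get(0))  # testing points, iteration 1
sampled = hsm.sampleRegions(collection=t_data, properties=["PresAbs"], scale=grain_size)
sampled_df = geemap.ee_to_df(sampled)
auc_roc = roc_auc_score(sampled_df["PresAbs"], sampled_df["classification"])
auc_pr = average_precision_score(sampled_df["PresAbs"], sampled_df["classification"])  # step-wise estimate of AUC-PR
print(f"Iteration 1: AUC-ROC = {auc_roc:.4f}, AUC-PR = {auc_pr:.4f}")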