"""
gdf_edit.py
===========
This module provides functions to flag non-contributing areas (NCAs) or lakes and reservoirs in GeoDataFrames
based on intersection thresholds, with customizable options for column names, default values, and initialization values.
Example Usage
-------------
1. Using Shapefiles:
>>> from VectorPreProcessing.gdf_edit import flag_ncaalg_from_files
>>> flagged_gdf = flag_ncaalg_from_files(
... 'path/to/shapefile1.shp',
... 'path/to/shapefile2.shp',
... threshold=0.1,
... output_path='output.shp'
... )
>>> flagged_gdf = flag_ncaalg_from_files(
... 'path/to/shapefile1.shp',
... 'path/to/shapefile2.shp',
... threshold=0.1,
... output_path='output.shp',
... ncontr_col="custom_flag_column", # Custom column in gdf1 to store flags
... value_column="NON_ID", # Column in gdf2 with values to assign
... initial_value=0, # Initial value for gdf1's flag column
... default_value=5 # Default value if no value_column specified
... )
2. Using GeoDataFrames Directly:
>>> from VectorPreProcessing.gdf_edit import flag_ncaalg
>>> import geopandas as gpd
>>> gdf1 = gpd.read_file('path/to/shapefile1.shp')
>>> gdf2 = gpd.read_file('path/to/shapefile2.shp')
>>> flagged_gdf = flag_ncaalg(gdf1, gdf2, threshold=0.1)
>>> flagged_gdf = flag_ncaalg(
... gdf1,
... gdf2,
... threshold=0.1,
... ncontr_col="custom_flag_column", # Custom column in gdf1 to store flags
... value_column="NON_ID", # Column in gdf2 with values to assign
... initial_value=0, # Initial value for gdf1's flag column
... default_value=5 # Default value if no value_column specified
... )
"""
import geopandas as gpd
import pandas as pd
[docs]
def flag_ncaalg(
gdf1: gpd.GeoDataFrame,
gdf2: gpd.GeoDataFrame,
threshold: float = 0.1, # Threshold set to 10% by default
output_path: str = None,
ncontr_col: str = "ncontr", # User-defined column name for flag in gdf1
value_column: str = None, # Optional column in gdf2 for dynamic values
initial_value=None, # Initial value for the ncontr_col in gdf1
default_value=2 # Default value for intersections if value_column is None
) -> gpd.GeoDataFrame:
"""
Flag intersections and optionally assign values from gdf2.
This function identifies intersections between polygons in gdf1 and gdf2 that meet a specified
threshold. If an intersection is found, a constant value (default is 2) or a value from a specified
column in gdf2 (if provided) is assigned to the corresponding row in gdf1. If multiple intersections
exist, the first match is used.
Parameters
----------
gdf1 : gpd.GeoDataFrame
The primary GeoDataFrame.
gdf2 : gpd.GeoDataFrame
The secondary GeoDataFrame with values to assign.
threshold : float, optional
The threshold for considering an intersection significant (default is 0.1 or 10%).
output_path : str, optional
Path where the modified gdf1 should be saved. If None, the file is not saved.
ncontr_col : str, optional
The name of the column to store assigned values in gdf1.
value_column : str, optional
The name of the column in gdf2 with values to assign to gdf1. If None, a constant value (default_value) is used.
initial_value : optional
The initial value to assign to the ncontr_col column in gdf1 before processing intersections.
default_value : optional
The default value to assign to the ncontr_col column if value_column is None (default is 2).
Returns
-------
gpd.GeoDataFrame
The modified gdf1 with assigned values based on intersections.
"""
# Initialize the target column with initial_value in gdf1
gdf1[ncontr_col] = initial_value
# Create spatial index for gdf2
spatial_index = gdf2.sindex
# Iterate over gdf1 using spatial indexing to find potential intersections
for index, row in gdf1.iterrows():
# Use spatial index to find potential intersections
possible_matches_index = list(spatial_index.query(row['geometry'], predicate='intersects'))
if not possible_matches_index:
continue # No intersections, move to next row
# Filter possible matches for actual intersection
possible_matches = gdf2.iloc[possible_matches_index]
actual_intersections = possible_matches[possible_matches.intersects(row['geometry'])]
# Calculate area fractions for actual intersections
for _, match in actual_intersections.iterrows():
intersection_area = row['geometry'].intersection(match['geometry']).area
area_fraction = intersection_area / row['geometry'].area
if area_fraction > threshold:
# Assign either a value from gdf2's value_column or the default value
gdf1.at[index, ncontr_col] = match[value_column] if value_column else default_value
break # Use only the first valid intersection to assign the value
# Save the modified gdf1 to a new shapefile if an output path is provided
if output_path is not None:
gdf1.to_file(output_path)
return gdf1
[docs]
def flag_ncaalg_from_files(
shapefile1: str,
shapefile2: str,
threshold: float = 0.1, # Threshold set to 10% by default
output_path: str = None,
ncontr_col: str = "ncontr", # User-defined column name for flag in gdf1
value_column: str = None, # Optional column in gdf2 for dynamic values
initial_value=None, # Initial value for the ncontr_col in gdf1
default_value=2 # Default value for intersections if value_column is None
) -> gpd.GeoDataFrame:
"""
Read two shapefiles, set their CRS to EPSG:4326, and apply the `flag_ncaalg` function.
Parameters
----------
shapefile1 : str
Path to the first shapefile.
shapefile2 : str
Path to the second shapefile.
threshold : float, optional
The threshold for considering an intersection significant, as a fraction of
the first GeoDataFrame's polygon area (default is 0.1 for 10%).
output_path : str, optional
Path where the modified first GeoDataFrame should be saved. If None, the file is not saved.
ncontr_col : str, optional
The name of the column to flag intersections in gdf1.
value_column : str, optional
The name of the column in gdf2 with values to assign to gdf1.
initial_value : optional
The initial value to assign to the ncontr_col column in gdf1 before processing intersections.
default_value : optional
The default value to assign to the ncontr_col column if value_column is None (default is 2).
Returns
-------
gpd.GeoDataFrame
The modified GeoDataFrame of the first GeoDataFrame with the specified column added.
"""
# Read the shapefiles into GeoDataFrames
gdf1 = gpd.read_file(shapefile1)
gdf2 = gpd.read_file(shapefile2)
# Set the CRS to EPSG:4326 in place
gdf1.to_crs(epsg=4326, inplace=True)
gdf2.to_crs(epsg=4326, inplace=True)
# Call the original flag_ncaalg function with the specified column name, value column, initial value, and default value
return flag_ncaalg(gdf1, gdf2, threshold, output_path, ncontr_col, value_column, initial_value, default_value)
# Examples:
# 1. Default usage without initial value, constant value assignment (default is 2):
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path)
# 2. Using a value column from gdf2, still with default initialization (None):
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, value_column="NON_ID")
# 3. Default usage with initial value set to 0, constant value assignment:
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, initial_value=0)
# 4. Using a value column from gdf2 with initial value set to 0:
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, value_column="NON_ID", initial_value=0)
# 5. Default usage with custom default value of 3:
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, default_value=3)
# 6. Using a value column with custom default value of 5 if no value column is provided:
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, value_column="NON_ID", default_value=5)