Source code for VectorPreProcessing.gdf_edit

"""
gdf_edit.py
===========

This module provides functions to flag non-contributing areas (NCAs) or lakes and reservoirs in GeoDataFrames
based on intersection thresholds, with customizable options for column names, default values, and initialization values.

Example Usage
-------------
1. Using Shapefiles:
>>> from VectorPreProcessing.gdf_edit import flag_ncaalg_from_files
>>> flagged_gdf = flag_ncaalg_from_files(
...     'path/to/shapefile1.shp', 
...     'path/to/shapefile2.shp', 
...     threshold=0.1, 
...     output_path='output.shp'
... )

>>> flagged_gdf = flag_ncaalg_from_files(
...     'path/to/shapefile1.shp', 
...     'path/to/shapefile2.shp', 
...     threshold=0.1, 
...     output_path='output.shp', 
...     ncontr_col="custom_flag_column",   # Custom column in gdf1 to store flags
...     value_column="NON_ID",             # Column in gdf2 with values to assign
...     initial_value=0,                   # Initial value for gdf1's flag column
...     default_value=5                    # Default value if no value_column specified
... )

2. Using GeoDataFrames Directly:
>>> from VectorPreProcessing.gdf_edit import flag_ncaalg
>>> import geopandas as gpd
>>> gdf1 = gpd.read_file('path/to/shapefile1.shp')
>>> gdf2 = gpd.read_file('path/to/shapefile2.shp')
>>> flagged_gdf = flag_ncaalg(gdf1, gdf2, threshold=0.1)

>>> flagged_gdf = flag_ncaalg(
...     gdf1, 
...     gdf2, 
...     threshold=0.1, 
...     ncontr_col="custom_flag_column",   # Custom column in gdf1 to store flags
...     value_column="NON_ID",             # Column in gdf2 with values to assign
...     initial_value=0,                   # Initial value for gdf1's flag column
...     default_value=5                    # Default value if no value_column specified
... )
"""
import geopandas as gpd
import pandas as pd


[docs]
def flag_ncaalg(
    gdf1: gpd.GeoDataFrame,
    gdf2: gpd.GeoDataFrame,
    threshold: float = 0.1,  # Threshold set to 10% by default
    output_path: str = None,
    ncontr_col: str = "ncontr",  # User-defined column name for flag in gdf1
    value_column: str = None,    # Optional column in gdf2 for dynamic values
    initial_value=None,          # Initial value for the ncontr_col in gdf1
    default_value=2              # Default value for intersections if value_column is None
) -> gpd.GeoDataFrame:
    """
    Flag intersections and optionally assign values from gdf2.

    This function identifies intersections between polygons in gdf1 and gdf2 that meet a specified
    threshold. If an intersection is found, a constant value (default is 2) or a value from a specified
    column in gdf2 (if provided) is assigned to the corresponding row in gdf1. If multiple intersections 
    exist, the first match is used.

    Parameters
    ----------
    gdf1 : gpd.GeoDataFrame
        The primary GeoDataFrame.
    gdf2 : gpd.GeoDataFrame
        The secondary GeoDataFrame with values to assign.
    threshold : float, optional
        The threshold for considering an intersection significant (default is 0.1 or 10%).
    output_path : str, optional
        Path where the modified gdf1 should be saved. If None, the file is not saved.
    ncontr_col : str, optional
        The name of the column to store assigned values in gdf1.
    value_column : str, optional
        The name of the column in gdf2 with values to assign to gdf1. If None, a constant value (default_value) is used.
    initial_value : optional
        The initial value to assign to the ncontr_col column in gdf1 before processing intersections.
    default_value : optional
        The default value to assign to the ncontr_col column if value_column is None (default is 2).

    Returns
    -------
    gpd.GeoDataFrame
        The modified gdf1 with assigned values based on intersections.
    """
    # Initialize the target column with initial_value in gdf1
    gdf1[ncontr_col] = initial_value
    
    # Create spatial index for gdf2
    spatial_index = gdf2.sindex
    
    # Iterate over gdf1 using spatial indexing to find potential intersections
    for index, row in gdf1.iterrows():
        # Use spatial index to find potential intersections
        possible_matches_index = list(spatial_index.query(row['geometry'], predicate='intersects'))
        if not possible_matches_index:
            continue  # No intersections, move to next row
        
        # Filter possible matches for actual intersection
        possible_matches = gdf2.iloc[possible_matches_index]
        actual_intersections = possible_matches[possible_matches.intersects(row['geometry'])]
        
        # Calculate area fractions for actual intersections
        for _, match in actual_intersections.iterrows():
            intersection_area = row['geometry'].intersection(match['geometry']).area
            area_fraction = intersection_area / row['geometry'].area
            if area_fraction > threshold:
                # Assign either a value from gdf2's value_column or the default value
                gdf1.at[index, ncontr_col] = match[value_column] if value_column else default_value
                break  # Use only the first valid intersection to assign the value
                
    # Save the modified gdf1 to a new shapefile if an output path is provided
    if output_path is not None:
        gdf1.to_file(output_path)
    
    return gdf1



[docs]
def flag_ncaalg_from_files(
    shapefile1: str,
    shapefile2: str,
    threshold: float = 0.1,  # Threshold set to 10% by default
    output_path: str = None,
    ncontr_col: str = "ncontr",  # User-defined column name for flag in gdf1
    value_column: str = None,    # Optional column in gdf2 for dynamic values
    initial_value=None,          # Initial value for the ncontr_col in gdf1
    default_value=2              # Default value for intersections if value_column is None
) -> gpd.GeoDataFrame:
    """
    Read two shapefiles, set their CRS to EPSG:4326, and apply the `flag_ncaalg` function.

    Parameters
    ----------
    shapefile1 : str
        Path to the first shapefile.
    shapefile2 : str
        Path to the second shapefile.
    threshold : float, optional
        The threshold for considering an intersection significant, as a fraction of
        the first GeoDataFrame's polygon area (default is 0.1 for 10%).
    output_path : str, optional
        Path where the modified first GeoDataFrame should be saved. If None, the file is not saved.
    ncontr_col : str, optional
        The name of the column to flag intersections in gdf1.
    value_column : str, optional
        The name of the column in gdf2 with values to assign to gdf1.
    initial_value : optional
        The initial value to assign to the ncontr_col column in gdf1 before processing intersections.
    default_value : optional
        The default value to assign to the ncontr_col column if value_column is None (default is 2).

    Returns
    -------
    gpd.GeoDataFrame
        The modified GeoDataFrame of the first GeoDataFrame with the specified column added.
    """
    # Read the shapefiles into GeoDataFrames
    gdf1 = gpd.read_file(shapefile1)
    gdf2 = gpd.read_file(shapefile2)

    # Set the CRS to EPSG:4326 in place
    gdf1.to_crs(epsg=4326, inplace=True)
    gdf2.to_crs(epsg=4326, inplace=True)

    # Call the original flag_ncaalg function with the specified column name, value column, initial value, and default value
    return flag_ncaalg(gdf1, gdf2, threshold, output_path, ncontr_col, value_column, initial_value, default_value)


# Examples:

# 1. Default usage without initial value, constant value assignment (default is 2):
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path)

# 2. Using a value column from gdf2, still with default initialization (None):
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, value_column="NON_ID")

# 3. Default usage with initial value set to 0, constant value assignment:
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, initial_value=0)

# 4. Using a value column from gdf2 with initial value set to 0:
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, value_column="NON_ID", initial_value=0)

# 5. Default usage with custom default value of 3:
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, default_value=3)

# 6. Using a value column with custom default value of 5 if no value column is provided:
# flagged_gdf = flag_ncaalg_from_files(input_basin_path, nctr_test, threshold=0.1, output_path=output_river_path, value_column="NON_ID", default_value=5)