#!/usr/bin/env python3
"""
Enhanced Berlin green space processor using existing tree and toilet services.
Downloads OSM green space boundaries and enhances them with real data using existing services.
"""
import os
import json
import zipfile
import requests
import asyncio
from pathlib import Path
import geopandas as gpd
import pandas as pd
from datetime import datetime
from typing import List, Dict, Optional
import sys
# Add the app directory to Python path to import services
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
class RealDataGreenSpaceProcessor:
    """Enhances OSM green-space boundaries with real tree and toilet data."""

    def __init__(self, data_dir: str = "app/data"):
        """Set up the data directory layout and the backing services.

        Args:
            data_dir: Root directory holding raw and processed data files.
        """
        base = Path(data_dir)
        self.data_dir = base
        self.raw_dir = base / "geo-raw"
        self.processed_dir = base / "processed"

        # Make sure the on-disk layout exists before any download step runs.
        for directory in (self.raw_dir, self.processed_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Reuse the project's existing services for tree and toilet lookups.
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()
def download_berlin_districts(self):
"""Download Berlin district boundaries."""
json_file = self.raw_dir / "bezirksgrenzen.geojson"
if json_file.exists():
print(f"Berlin district data already exists: {json_file}")
return json_file
link = "https://tsb-opendata.s3.eu-central-1.amazonaws.com/bezirksgrenzen/bezirksgrenzen.geojson"
print(f"Downloading Berlin district data from {link}")
try:
response = requests.get(link, timeout=30)
response.raise_for_status()
with open(json_file, 'wb') as f:
f.write(response.content)
print(f"Downloaded to {json_file}")
return json_file
except Exception as e:
print(f"Error downloading districts: {e}")
raise
def download_osm_data(self):
"""Download Berlin OpenStreetMap data."""
zip_file = self.raw_dir / "berlin_shapes.zip"
shp_dir = self.raw_dir / "berlin_shapes"
# Check if already extracted
required_files = ["gis_osm_landuse_a_free_1.shp", "gis_osm_natural_a_free_1.shp", "gis_osm_leisure_a_free_1.shp"]
if all((shp_dir / f).exists() for f in required_files):
print(f"Berlin OSM data already exists: {shp_dir}")
return shp_dir
if not zip_file.exists():
link = "https://download.geofabrik.de/europe/germany/berlin-latest-free.shp.zip"
print(f"Downloading Berlin OSM data from {link} (this may take several minutes...)")
try:
response = requests.get(link, stream=True, timeout=300) # 5 min timeout
response.raise_for_status()
with open(zip_file, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"Download completed: {zip_file}")
except Exception as e:
print(f"Error downloading OSM data: {e}")
raise
print(f"Extracting Berlin OSM data to {shp_dir}")
try:
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(shp_dir)
print(f"Extracted to {shp_dir}")
except Exception as e:
print(f"Error extracting OSM data: {e}")
raise
return shp_dir
def load_osm_green_spaces(self):
"""Load OSM green space polygons."""
print("Loading OSM green space boundaries...")
# Download required data
districts_file = self.download_berlin_districts()
shp_dir = self.download_osm_data()
# Load Berlin districts for clipping
districts = gpd.read_file(districts_file)
# Define green space categories we want
green_categories = {
'landuse': ['forest', 'grass', 'meadow', 'recreation_ground', 'village_green', 'allotments'],
'natural': ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood'],
'leisure': ['park', 'garden', 'nature_reserve', 'playground', 'pitch', 'common', 'golf_course']
}
all_green_spaces = []
# Process each category
for category, subcategories in green_categories.items():
shapefile = shp_dir / f"gis_osm_{category}_a_free_1.shp"
if not shapefile.exists():
print(f"Warning: {shapefile} not found, skipping")
continue
print(f"Processing {category} data...")
try:
gdf = gpd.read_file(shapefile)
# Filter to relevant subcategories
gdf_filtered = gdf[gdf['fclass'].isin(subcategories)].copy()
if len(gdf_filtered) == 0:
print(f"No {category} features found in subcategories")
continue
# Clip to Berlin boundaries
gdf_clipped = gpd.clip(gdf_filtered, districts)
# Calculate area and filter out very small areas (< 1000 sqm)
gdf_clipped['area_sqm'] = gdf_clipped.geometry.area
gdf_clipped = gdf_clipped[gdf_clipped['area_sqm'] >= 1000]
if len(gdf_clipped) > 0:
all_green_spaces.append(gdf_clipped)
print(f"Found {len(gdf_clipped)} {category} features")
except Exception as e:
print(f"Error processing {category}: {e}")
continue
if not all_green_spaces:
raise ValueError("No green space data found")
# Combine all green spaces
green_spaces = gpd.GeoDataFrame(pd.concat(all_green_spaces, ignore_index=True))
# Add district information
green_spaces = gpd.sjoin(green_spaces, districts[['Bezirk', 'geometry']], how='left')
# Calculate centroids for analysis
green_spaces['centroid'] = green_spaces.geometry.centroid
green_spaces['centroid_lat'] = green_spaces.centroid.y
green_spaces['centroid_lng'] = green_spaces.centroid.x
print(f"Total green spaces found: {len(green_spaces)}")
return green_spaces
    async def enhance_green_space_with_real_data(self, row):
        """Enhance a single green space with real tree and toilet data.

        Args:
            row: GeoDataFrame row carrying 'centroid_lat', 'centroid_lng',
                'area_sqm' plus (optionally) 'fclass', 'name', 'Bezirk'
                and a geometry.

        Returns:
            Dict describing the enhanced green space, or None if any lookup
            failed for this row (the error is printed, not re-raised).
        """
        try:
            lat = row['centroid_lat']
            lng = row['centroid_lng']
            area_sqm = int(row['area_sqm'])
            # Use existing tree service to get real tree data. The search
            # radius scales with the space's size (sqrt(area) approximates
            # the side length), capped at 400 m.
            tree_response = await self.tree_service.get_trees_near_location(
                lat, lng, radius_m=min(400, int((area_sqm ** 0.5) * 1.5))  # Adaptive radius
            )
            # Use existing toilet service to get real toilet data (800 m radius)
            nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 800)
            # Calculate toilet accessibility score
            toilet_score = self._score_toilet_accessibility(nearby_toilets)
            # Map OSM type to our enum
            space_type = self._map_osm_to_space_type(row.get('fclass', ''))
            # Generate ID (row.name is the pandas index label of this row)
            space_id = f"real_{row.get('fclass', 'unknown')}_{row.name}"
            # Create enhanced green space using real data
            enhanced_space = {
                "id": space_id,
                "name": row.get('name') or f"{row.get('fclass', 'Green Space').title()} in {row.get('Bezirk', 'Berlin')}",
                "description": f"Real Berlin {row.get('fclass', 'green space')} enhanced with tree and toilet data",
                "type": space_type,
                "coordinates": {
                    "lat": float(lat),
                    "lng": float(lng)
                },
                "neighborhood": row.get('Bezirk', 'Unknown'),
                "area_sqm": area_sqm,
                # NOTE(review): geometry.length is evaluated in the geometry's
                # native CRS — if that is WGS84 this is degrees, not metres;
                # confirm the upstream projection.
                "perimeter_m": int(row.geometry.length) if hasattr(row.geometry, 'length') else 0,
                # Environmental features using real tree data
                "environmental": {
                    # Floor of 5% so sparse tree data never reports 0 coverage
                    "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                    "shade_quality": tree_response.shade_analysis.shade_quality_score,
                    "noise_level": self._estimate_noise_level(row.get('fclass', ''), row.get('Bezirk', '')),
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    # Heuristic: inferred from the feature class / name only
                    "water_features": 'water' in str(row.get('fclass', '')).lower() or 'river' in str(row.get('name', '')).lower(),
                    "natural_surface_percent": self._estimate_natural_surface(row.get('fclass', ''))
                },
                # Real tree metrics from existing service
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species
                },
                # Real toilet accessibility from existing service
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": toilet_score,
                    # assumes the service returns toilets sorted nearest-first
                    # — TODO confirm against BerlinDataService
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },
                # Standard accessibility features (static heuristics, not
                # measured data)
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": 3,  # Could be enhanced with real transit data
                    "cycling_infrastructure": area_sqm > 5000,
                    "parking_availability": 2,
                    "lighting_quality": 2
                },
                # Recreation features based on OSM data and size thresholds
                "recreation": {
                    "playground_quality": self._estimate_playground_quality(row.get('fclass', ''), tree_response.metrics.total_trees),
                    "sports_facilities": 'pitch' in str(row.get('fclass', '')).lower() or 'sport' in str(row.get('name', '')).lower(),
                    "running_paths": area_sqm > 8000,
                    "cycling_paths": area_sqm > 15000,
                    "dog_friendly": True,
                    "bbq_allowed": row.get('fclass') in ['park', 'recreation_ground'] and area_sqm > 5000
                },
                "last_updated": datetime.now().isoformat(),
                "data_sources": ["openstreetmap", "berlin_tree_cadastre", "berlin_toilets"],
                "confidence_score": 95
            }
            return enhanced_space
        except Exception as e:
            # Best-effort: log and skip this row rather than abort the batch.
            print(f"Error enhancing green space {row.name}: {e}")
            return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
"""Score toilet accessibility using existing toilet data."""
if not nearby_toilets:
return 20
nearest_distance = nearby_toilets[0]['distance_meters']
# Distance-based scoring
if nearest_distance <= 200:
score = 100
elif nearest_distance <= 400:
score = 80
elif nearest_distance <= 600:
score = 60
else:
score = 40
# Bonuses for quality
free_toilets = len([t for t in nearby_toilets if t.get('is_free', False)])
accessible_toilets = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
score += min(20, free_toilets * 5 + accessible_toilets * 3)
return min(100, score)
def _map_osm_to_space_type(self, fclass: str) -> str:
"""Map OSM feature class to green space types."""
mapping = {
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN',
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
'wood': 'FOREST', 'heath': 'HEATH', 'pitch': 'SPORTS_AREA',
'golf_course': 'SPORTS_AREA', 'common': 'PARK', 'village_green': 'GRASS',
'allotments': 'GARDEN'
}
return mapping.get(fclass, 'PARK')
def _estimate_noise_level(self, fclass: str, district: str) -> int:
"""Estimate noise level (1=very quiet, 5=very noisy)."""
base_noise = {
'forest': 1, 'nature_reserve': 1, 'wood': 1,
'meadow': 2, 'grass': 2, 'heath': 2,
'park': 2, 'garden': 2, 'common': 2,
'recreation_ground': 3, 'playground': 3, 'pitch': 3,
'golf_course': 2, 'allotments': 2
}
# Central districts are noisier
central_districts = ['Mitte', 'Kreuzberg', 'Friedrichshain']
district_modifier = 1 if district in central_districts else 0
return min(5, base_noise.get(fclass, 2) + district_modifier)
def _estimate_natural_surface(self, fclass: str) -> int:
"""Estimate percentage of natural surface."""
surface_map = {
'forest': 95, 'nature_reserve': 95, 'wood': 95,
'meadow': 95, 'grass': 90, 'heath': 90,
'park': 75, 'garden': 65, 'common': 80,
'recreation_ground': 60, 'playground': 40, 'pitch': 20,
'golf_course': 70, 'allotments': 85
}
return surface_map.get(fclass, 70)
def _estimate_playground_quality(self, fclass: str, tree_count: int) -> int:
"""Estimate playground quality score."""
base_scores = {
'playground': 85,
'park': 65,
'recreation_ground': 70,
'garden': 40,
'common': 50
}
base = base_scores.get(fclass, 25)
# Trees improve playground appeal for families
tree_bonus = min(15, tree_count // 5) # +3 per 5 trees, max 15
return min(100, base + tree_bonus)
async def process_all_green_spaces(self):
"""Process all green spaces with real data enhancement."""
print("Starting enhanced green space processing with real data...")
# Load OSM green space boundaries
osm_green_spaces = self.load_osm_green_spaces()
enhanced_green_spaces = []
print(f"Enhancing {len(osm_green_spaces)} green spaces with real tree and toilet data...")
# Process in batches to avoid overwhelming the system
batch_size = 50
total_processed = 0
for i in range(0, len(osm_green_spaces), batch_size):
batch = osm_green_spaces.iloc[i:i+batch_size]
batch_results = []
for idx, row in batch.iterrows():
result = await self.enhance_green_space_with_real_data(row)
if result:
batch_results.append(result)
total_processed += 1
if total_processed % 25 == 0:
print(f"Processed {total_processed}/{len(osm_green_spaces)} green spaces...")
enhanced_green_spaces.extend(batch_results)
# Small delay between batches
await asyncio.sleep(0.1)
print(f"Successfully enhanced {len(enhanced_green_spaces)} green spaces with real data")
return enhanced_green_spaces
def save_enhanced_data(self, enhanced_green_spaces: List[Dict]):
"""Save enhanced green spaces to JSON file."""
output_file = self.processed_dir / "real_berlin_green_spaces.json"
# Calculate summary statistics
spaces_with_trees = len([gs for gs in enhanced_green_spaces if gs["tree_data"]["total_trees"] > 0])
spaces_with_toilets = len([gs for gs in enhanced_green_spaces if gs["toilet_accessibility"]["nearby_toilets_count"] > 0])
total_trees = sum(gs["tree_data"]["total_trees"] for gs in enhanced_green_spaces)
avg_species_per_space = sum(gs["tree_data"]["species_count"] for gs in enhanced_green_spaces) / len(enhanced_green_spaces) if enhanced_green_spaces else 0
data = {
"green_spaces": enhanced_green_spaces,
"total_count": len(enhanced_green_spaces),
"last_updated": datetime.now().isoformat(),
"data_sources": [
"openstreetmap_boundaries",
"berlin_tree_cadastre_via_service",
"berlin_toilet_locations_via_service",
"berlin_districts"
],
"processing_info": {
"script_version": "1.0",
"coordinate_system": "WGS84",
"uses_existing_services": True,
"tree_analysis_via": "StreetTreeService",
"toilet_analysis_via": "BerlinDataService"
},
"summary_stats": {
"spaces_with_trees": spaces_with_trees,
"spaces_with_nearby_toilets": spaces_with_toilets,
"total_trees_in_all_spaces": total_trees,
"average_species_per_space": round(avg_species_per_space, 1),
"coverage_percentage": {
"with_tree_data": round((spaces_with_trees / len(enhanced_green_spaces)) * 100, 1) if enhanced_green_spaces else 0,
"with_toilet_data": round((spaces_with_toilets / len(enhanced_green_spaces)) * 100, 1) if enhanced_green_spaces else 0
}
}
}
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
print(f"✅ Saved {len(enhanced_green_spaces)} enhanced green spaces to {output_file}")
print(f"📊 Summary:")
print(f" - {spaces_with_trees} spaces have tree data ({round((spaces_with_trees/len(enhanced_green_spaces))*100, 1)}%)")
print(f" - {spaces_with_toilets} spaces have nearby toilets ({round((spaces_with_toilets/len(enhanced_green_spaces))*100, 1)}%)")
print(f" - {total_trees} total trees analyzed")
print(f" - {avg_species_per_space:.1f} average species per space")
return output_file
async def main():
    """Run the full enhancement pipeline and report the output location."""
    processor = RealDataGreenSpaceProcessor()
    try:
        # Enhance all green spaces via the existing services, then persist.
        enhanced = await processor.process_all_green_spaces()
        output_file = processor.save_enhanced_data(enhanced)
        print(f"\n🎉 Successfully created real data enhanced Berlin green spaces!")
        print(f"📁 Output: {output_file}")
    except KeyboardInterrupt:
        print("\n⚠️ Processing interrupted by user")
    except Exception as e:
        print(f"❌ Error processing data: {e}")
        raise
if __name__ == "__main__":
    # Script entry point: drive the async pipeline to completion.
    asyncio.run(main())