# berlin-picnic-api/scripts/quick_green_spaces.py

#!/usr/bin/env python3
"""
Quick Berlin green spaces processor.
Pre-filters OSM data efficiently, then processes only the best candidates.
"""

import json
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
import sys
import re
import math
# NOTE: tqdm is not available here, so the script reports progress with print() instead.
from xml.etree.ElementTree import iterparse

# Add the project root (the directory above this script's folder) to the Python path
# so that the `app` package can be imported.
sys.path.append(str(Path(__file__).parent.parent))

from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService


def calculate_polygon_area_sqm(coords):
    """Estimate the area, in square metres, enclosed by a lat/lng polygon.

    The vertices are projected onto a flat plane centred on their mean
    position (a planar approximation intended for small areas), and the
    enclosed area is computed with the Shoelace formula.

    Args:
        coords: Sequence of ``(lat, lng)`` pairs in degrees.

    Returns:
        The area truncated to an ``int``, with three fallbacks: fewer than
        three vertices, or a computed area below 100 m², gives 5000; a
        computed area above 10,000,000 m² gives 500000.
    """
    vertex_count = len(coords)
    if vertex_count < 3:
        return 5000  # Not a polygon: fall back to the default size

    # Mean position of the vertices; used as the origin of the local plane.
    mean_lat = sum(lat for lat, lng in coords) / vertex_count
    mean_lng = sum(lng for lat, lng in coords) / vertex_count

    # Degrees -> metres. A degree of longitude shrinks with cos(latitude).
    metres_per_deg_lat = 111320
    metres_per_deg_lng = 111320 * math.cos(mean_lat * math.pi / 180)

    projected = [
        ((lng - mean_lng) * metres_per_deg_lng, (lat - mean_lat) * metres_per_deg_lat)
        for lat, lng in coords
    ]

    # Shoelace formula: accumulate the cross products of consecutive vertices,
    # wrapping from the last vertex back to the first.
    twice_signed_area = 0
    for index, (x_here, y_here) in enumerate(projected):
        x_next, y_next = projected[(index + 1) % vertex_count]
        twice_signed_area += x_here * y_next
        twice_signed_area -= x_next * y_here

    square_metres = abs(twice_signed_area) / 2

    if square_metres < 100:
        return 5000  # Implausibly small: use the default size
    if square_metres > 10000000:
        return 500000  # Implausibly large (> 10 km²): cap the value
    return int(square_metres)


def calculate_search_radius(area_sqm):
    """Pick the tree search radius (in metres) for a park of the given area.

    Args:
        area_sqm: Park area in square metres.

    Returns:
        150, 300 or 500 for areas below 1, 5 or 20 hectares respectively,
        and 800 for anything at or above 20 hectares.
    """
    # (exclusive upper area limit in m², radius in m), smallest limit first.
    radius_by_area_limit = (
        (10000, 150),    # < 1 hectare
        (50000, 300),    # < 5 hectares
        (200000, 500),   # < 20 hectares
    )
    for upper_limit, radius in radius_by_area_limit:
        if area_sqm < upper_limit:
            return radius
    return 800  # Large parks like Treptower Park


def calculate_enhanced_shade_quality(tree_response, area_sqm):
    """Score shade quality from 0 to 100 using tree measurements.

    Five factors each contribute points according to the highest tier their
    value reaches; a value below the lowest tier contributes nothing.

    Args:
        tree_response: Object exposing ``metrics`` (with
            ``shade_coverage_percent``, ``trees_per_hectare``,
            ``average_height`` and ``average_crown_diameter``) and
            ``shade_analysis`` (with ``nearby_large_trees``). Falsy values
            such as ``None`` are treated as 0 / an empty list.
        area_sqm: Accepted for interface compatibility; not used by the
            calculation.

    Returns:
        The summed points, capped at 100.
    """
    def tier_points(value, tiers):
        # Tiers are listed from the highest threshold down; the first one
        # the value reaches decides the points.
        for minimum, points in tiers:
            if value >= minimum:
                return points
        return 0

    metrics = tree_response.metrics
    shade_analysis = tree_response.shade_analysis

    contributions = (
        # Factor 1: actual shade coverage (crown area based)
        tier_points(
            metrics.shade_coverage_percent or 0,
            ((60, 40), (40, 30), (20, 20), (10, 10)),
        ),
        # Factor 2: number of large mature trees
        tier_points(
            len(shade_analysis.nearby_large_trees or []),
            ((10, 25), (5, 20), (3, 15), (1, 10)),
        ),
        # Factor 3: tree density
        tier_points(
            metrics.trees_per_hectare or 0,
            ((50, 20), (30, 15), (20, 10), (10, 5)),
        ),
        # Factor 4: average tree height
        tier_points(
            metrics.average_height or 0,
            ((20, 10), (15, 8), (10, 5), (5, 3)),
        ),
        # Factor 5: average crown diameter
        tier_points(
            metrics.average_crown_diameter or 0,
            ((12, 5), (8, 3), (5, 1)),
        ),
    )

    return min(100, sum(contributions))


def detect_water_features(candidate):
    """Guess whether a green space has a water feature.

    A candidate counts as having water when one of its ``water``,
    ``waterway`` or ``natural`` tags holds a water-like value, or when its
    lower-cased name contains a German/English water or fountain word.

    Args:
        candidate: Dict that may carry a ``'tags'`` dict and a ``'name'``
            string; both default to empty when missing.

    Returns:
        ``True`` if any indicator matches, otherwise ``False``.
    """
    tags = candidate.get('tags', {})
    name = candidate.get('name', '').lower()

    # OSM tags: the value (case-insensitive) must be one of the water kinds.
    water_values = ('water', 'lake', 'pond', 'reservoir', 'river', 'stream')
    for key in ('water', 'waterway', 'natural'):
        if tags.get(key, '').lower() in water_values:
            return True

    # Name: plain substring search for water words and fountain words.
    name_hints = (
        'see', 'teich', 'weiher', 'water', 'lake', 'pond',
        'fluss', 'river', 'bach', 'creek',
        'brunnen', 'fountain', 'springbrunnen',
    )
    return any(hint in name for hint in name_hints)


def estimate_berlin_district(lat: float, lng: float) -> str:
    """Approximate the Berlin district for a coordinate.

    The city is cut into horizontal latitude bands; inside a band the
    district is chosen by walking west to east through longitude limits.
    This is a coarse grid, not a lookup against real district boundaries.

    Args:
        lat: Latitude in degrees.
        lng: Longitude in degrees.

    Returns:
        The district name for the grid cell containing the point.
    """
    # Each band: (latitude the point must exceed,
    #             ((exclusive longitude limit, district), ...) west to east,
    #             district east of every limit)
    latitude_bands = (
        # Northern districts
        (52.55, (
            (13.25, "Reinickendorf"),
            (13.45, "Pankow"),
        ), "Lichtenberg"),
        # Central-north districts
        (52.52, (
            (13.20, "Spandau"),
            (13.30, "Charlottenburg-Wilmersdorf"),
            (13.42, "Mitte"),
            (13.48, "Friedrichshain-Kreuzberg"),
        ), "Lichtenberg"),
        # Central districts
        (52.48, (
            (13.20, "Spandau"),
            (13.30, "Charlottenburg-Wilmersdorf"),
            (13.35, "Tempelhof-Schöneberg"),
            (13.42, "Mitte"),
            (13.48, "Friedrichshain-Kreuzberg"),
        ), "Lichtenberg"),
        # Southern-central districts
        (52.45, (
            (13.20, "Steglitz-Zehlendorf"),
            (13.35, "Tempelhof-Schöneberg"),
            (13.45, "Neukölln"),
            (13.55, "Treptow-Köpenick"),
        ), "Marzahn-Hellersdorf"),
    )

    # Southern districts: used when the point lies in none of the bands above.
    lng_limits = ((13.35, "Steglitz-Zehlendorf"),)
    easternmost = "Treptow-Köpenick"

    for lat_floor, band_limits, band_easternmost in latitude_bands:
        if lat > lat_floor:
            lng_limits, easternmost = band_limits, band_easternmost
            break

    for lng_limit, district in lng_limits:
        if lng < lng_limit:
            return district
    return easternmost


def get_specific_neighborhood(district: str, lat: float, lng: float) -> str:
    """Refine a district into a neighbourhood when the point is in a known box.

    Each district lists rectangular bounding boxes; the first box (in listed
    order) that contains the point, edges included, names the neighbourhood.

    Args:
        district: District name, e.g. as produced by
            ``estimate_berlin_district``.
        lat: Latitude in degrees.
        lng: Longitude in degrees.

    Returns:
        The matching neighbourhood name, or ``district`` unchanged when the
        district has no boxes or none of them contains the point.
    """
    # district -> ((neighbourhood, south, north, west, east), ...)
    boxes_by_district = {
        "Mitte": (
            ("Wedding", 52.540, 52.560, 13.33, 13.38),
            ("Moabit", 52.515, 52.530, 13.33, 13.38),
            ("Tiergarten", 52.510, 52.520, 13.35, 13.38),
            ("Prenzlauer Berg", 52.525, 52.545, 13.40, 13.43),
        ),
        "Charlottenburg-Wilmersdorf": (
            ("Wilmersdorf", 52.485, 52.505, 13.30, 13.33),
            ("Charlottenburg", 52.505, 52.525, 13.25, 13.33),
        ),
        "Friedrichshain-Kreuzberg": (
            ("Kreuzberg", 52.490, 52.510, 13.38, 13.42),
            ("Friedrichshain", 52.510, 52.525, 13.42, 13.48),
        ),
        "Tempelhof-Schöneberg": (
            ("Schöneberg", 52.480, 52.500, 13.33, 13.37),
            ("Tempelhof", 52.460, 52.480, 13.37, 13.42),
        ),
        "Steglitz-Zehlendorf": (
            ("Zehlendorf", 52.430, 52.450, 13.23, 13.30),
            ("Steglitz", 52.450, 52.470, 13.30, 13.35),
        ),
        "Treptow-Köpenick": (
            ("Köpenick", 52.430, 52.460, 13.55, 13.65),
            ("Treptow", 52.480, 52.500, 13.45, 13.50),
        ),
    }

    for label, south, north, west, east in boxes_by_district.get(district, ()):
        if south <= lat <= north and west <= lng <= east:
            return label

    return district


def _candidate_from_way(way_id, tags, coordinates):
    """Build a candidate record from one parsed OSM way.

    Args:
        way_id: Value of the way's ``id`` attribute (``None`` if absent).
        tags: Dict of the way's OSM tag keys to values.
        coordinates: List of ``(lat, lng)`` float pairs read from the way's
            ``nd`` children.

    Returns:
        A candidate dict, or ``None`` when the way has no name and no
        green-space key, has no coordinates, or lies outside the rough
        Berlin bounding box.
    """
    has_name = 'name' in tags
    # Deliberately lenient: any way carrying one of these keys is accepted.
    # This also covers the specific park/forest values of `leisure` and
    # `landuse`, so no separate value check is needed.
    has_green_tags = any(key in tags for key in ('leisure', 'landuse', 'natural', 'amenity'))
    if not (has_name or has_green_tags):
        return None

    if not coordinates:
        return None

    # Centre = plain mean of the vertices (for a single vertex this is the
    # vertex itself).
    lat = sum(point[0] for point in coordinates) / len(coordinates)
    lng = sum(point[1] for point in coordinates) / len(coordinates)

    # Basic Berlin bounds check
    if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
        return None

    return {
        'id': f"quick_{way_id}",
        'name': tags.get('name', f"Unnamed {tags.get('leisure', tags.get('landuse', 'area'))}"),
        'type': tags.get('leisure') or tags.get('landuse') or 'park',
        'lat': lat,
        'lng': lng,
        'has_name': has_name,
        'tags': tags,
        'coordinates': coordinates,  # Kept for the later area calculation
    }


def _scan_osm_candidates(osm_file, limit=100):
    """Stream the OSM file once and collect up to ``limit`` candidate ways.

    Tags and coordinates are read from the attributes of ``tag`` and ``nd``
    elements nested in each ``way``; ``nd`` elements without ``lat``/``lon``
    attributes are ignored.

    Args:
        osm_file: Path of the OSM XML file.
        limit: Stop scanning once this many candidates have been collected.

    Returns:
        List of candidate dicts in file order.

    Raises:
        Whatever the XML parser, ``open`` or ``float`` raise on unreadable
        or malformed input; the caller decides how to report it.
    """
    candidates = []
    ways_processed = 0
    in_way = False
    way_tags = {}
    way_coordinates = []

    # Open the file explicitly so the handle is closed deterministically,
    # even when the scan stops early at the candidate limit.
    with open(osm_file, 'rb') as osm_stream:
        for event, elem in iterparse(osm_stream, events=('start', 'end')):
            if event == 'start':
                # Attributes are already available on 'start', so everything
                # needed from tag/nd children is captured here.
                if elem.tag == 'way':
                    in_way = True
                    way_tags = {}
                    way_coordinates = []
                    ways_processed += 1
                    if ways_processed % 1000 == 0:
                        print(f"Processed {ways_processed} ways, found {len(candidates)} candidates so far...")
                elif in_way and elem.tag == 'tag':
                    k = elem.get('k')
                    v = elem.get('v')
                    if k and v:
                        way_tags[k] = v
                elif in_way and elem.tag == 'nd':
                    lat = elem.get('lat')
                    lon = elem.get('lon')
                    if lat and lon:
                        way_coordinates.append((float(lat), float(lon)))
                continue

            # 'end' event
            if elem.tag == 'way' and in_way:
                in_way = False
                candidate = _candidate_from_way(elem.get('id'), way_tags, way_coordinates)
                if candidate is not None:
                    candidates.append(candidate)

            elem.clear()  # Free memory held by the finished element

            # Limit for quick processing
            if len(candidates) >= limit:
                break

    return candidates


def _build_enhanced_space(candidate, area_sqm, tree_response, nearby_toilets):
    """Assemble the output record for one green space.

    Args:
        candidate: Candidate dict as produced by ``_candidate_from_way``.
        area_sqm: Area computed from the candidate's polygon.
        tree_response: Result of the tree lookup; its ``metrics`` and
            ``shade_analysis`` attributes are read.
        nearby_toilets: Sequence of toilet dicts from the toilet lookup.

    Returns:
        A dict combining the candidate, the tree and toilet data, and fixed
        default values for attributes this script does not measure.
    """
    metrics = tree_response.metrics
    lat = candidate['lat']
    lng = candidate['lng']
    space_type = candidate['type']

    return {
        "id": candidate['id'],
        "name": candidate['name'],
        "description": f"Berlin {space_type} discovered via quick OSM processing",
        "type": "PARK",  # Simplified for now
        "coordinates": {
            "lat": lat,
            "lng": lng
        },
        "neighborhood": get_specific_neighborhood(estimate_berlin_district(lat, lng), lat, lng),
        "area_sqm": area_sqm,

        # Environmental features from the tree data
        "environmental": {
            # `or 0` guards against a missing coverage value, which the shade
            # scorer also tolerates; without it int(None) would raise and the
            # whole space would be dropped.
            "tree_coverage_percent": max(5, int(metrics.shade_coverage_percent or 0)),
            "shade_quality": calculate_enhanced_shade_quality(tree_response, area_sqm),
            "noise_level": 2,  # Default
            "wildlife_diversity_score": metrics.species_diversity_score,
            "water_features": detect_water_features(candidate),
            "natural_surface_percent": 80
        },

        # Tree data
        "tree_data": {
            "total_trees": metrics.total_trees,
            "trees_per_hectare": metrics.trees_per_hectare,
            "species_count": len(metrics.dominant_species),
            "species_diversity_score": metrics.species_diversity_score,
            "mature_trees_count": metrics.mature_trees_count,
            "young_trees_count": metrics.young_trees_count,
            "average_tree_age": metrics.average_tree_age,
            "average_height": metrics.average_height,
            "average_crown_diameter": metrics.average_crown_diameter,
            "shade_coverage_percent": metrics.shade_coverage_percent,
            "dominant_species": metrics.dominant_species[:3]
        },

        # Toilet data
        "toilet_accessibility": {
            "nearby_toilets_count": len(nearby_toilets),
            "accessibility_score": 80 if nearby_toilets else 30,
            # NOTE(review): takes the first entry as the nearest — presumably
            # the service returns toilets sorted by distance; confirm.
            "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
            "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
            "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
        },

        # Standard features (fixed defaults, not measured)
        "accessibility": {
            "wheelchair_accessible": True,
            "public_transport_score": 3,
            "cycling_infrastructure": True,
            "parking_availability": 2,
            "lighting_quality": 3
        },

        "recreation": {
            "playground_quality": 60 if space_type == 'park' else 30,
            "sports_facilities": space_type == 'recreation_ground',
            "running_paths": True,
            "cycling_paths": True,
            "dog_friendly": True,
            "bbq_allowed": space_type in ['park', 'recreation_ground']
        },

        "osm_metadata": {
            "has_official_name": candidate['has_name'],
            "tags": candidate['tags'],
            "source": "quick_osm_processing"
        },

        "last_updated": datetime.now().isoformat(),
        "data_sources": ["quick_osm_scan", "berlin_tree_cadastre", "berlin_toilets"],
        "confidence_score": 90 if candidate['has_name'] else 75
    }


async def quick_process():
    """Scan the OSM extract, enrich the candidates, and write the JSON result.

    Steps:
        1. Stream ``app/data/osm-raw/berlin_green_spaces.osm`` and collect up
           to 100 candidate ways.
        2. Sort named candidates first.
        3. For each candidate, in concurrent batches of 10, fetch tree and
           toilet data and build an enhanced record. A candidate whose
           processing raises is reported and skipped.
        4. Write everything, plus summary statistics, to
           ``app/data/processed/quick_berlin_green_spaces.json``.

    Both paths are relative to the current working directory. The function
    returns ``None``; it reports progress and errors with ``print``.
    """
    print("🚀 Quick Berlin Green Spaces Processor")
    print("=" * 45)

    # Check the input first so a missing file is reported before any other
    # work (service set-up, tree loading) is done.
    osm_file = Path("app/data/osm-raw/berlin_green_spaces.osm")
    if not osm_file.exists():
        print("❌ OSM file not found. Please ensure data is downloaded.")
        return

    # Initialize services
    tree_service = StreetTreeService()
    berlin_data = BerlinDataService()

    # Pre-load and index trees once to avoid repeated indexing
    print("🔄 Pre-loading tree data and building spatial index...")
    await tree_service._load_trees()

    print("🔍 Quick filtering for named parks and significant areas...")
    print(f"📁 OSM file size: {osm_file.stat().st_size / (1024*1024):.1f} MB")

    try:
        print("🔍 Single-pass XML parsing - ways with embedded coordinates...")
        candidates = _scan_osm_candidates(osm_file, limit=100)
        print(f"✅ Found {len(candidates)} promising green spaces")
    except Exception as e:
        # Top-level boundary of the scan: report and stop rather than crash.
        print(f"❌ Error in quick filtering: {e}")
        return

    if not candidates:
        print("No candidates found")
        return

    # Sort by having names (better quality); the sort is stable, so file
    # order is kept within each group.
    candidates.sort(key=lambda x: x['has_name'], reverse=True)

    print(f"\n🔧 Enhancing top {len(candidates)} spaces with real data...")

    batch_size = 10  # Number of candidates processed concurrently
    enhanced_spaces = []

    async def process_candidate(candidate):
        """Fetch tree and toilet data for one candidate and build its record.

        Returns ``(record, tree_count, toilet_count)``, or ``(None, 0, 0)``
        when anything goes wrong for this candidate.
        """
        try:
            # Area from the OSM polygon decides how far to search for trees.
            area_sqm = calculate_polygon_area_sqm(candidate.get('coordinates', []))
            search_radius = calculate_search_radius(area_sqm)

            print(f"🔍 Getting data for {candidate['name'][:30]}... (area: {area_sqm/10000:.1f}ha, radius: {search_radius}m)")

            # Tree and toilet lookups run concurrently.
            tree_response, nearby_toilets = await asyncio.gather(
                tree_service.get_trees_near_location(
                    candidate['lat'], candidate['lng'], radius_m=search_radius
                ),
                berlin_data.get_toilets_near_point(
                    candidate['lat'], candidate['lng'], 500
                ),
            )

            enhanced_space = _build_enhanced_space(
                candidate, area_sqm, tree_response, nearby_toilets
            )
            return enhanced_space, tree_response.metrics.total_trees, len(nearby_toilets)

        except Exception as e:
            # Best effort: one bad candidate must not abort the whole run.
            print(f"❌ Error processing {candidate['name']}: {e}")
            return None, 0, 0

    batch_total = (len(candidates) + batch_size - 1) // batch_size
    for batch_number, start in enumerate(range(0, len(candidates), batch_size), start=1):
        batch = candidates[start:start + batch_size]
        print(f"Processing batch {batch_number}/{batch_total}")

        results = await asyncio.gather(*(process_candidate(candidate) for candidate in batch))

        for result, trees, toilets in results:
            if result:
                enhanced_spaces.append(result)
                print(f"✅ {result['name'][:40]:40} - {trees:3d} trees, {toilets} toilets")

        # Small delay between batches to be respectful to APIs
        if start + batch_size < len(candidates):
            await asyncio.sleep(0.5)

    # Save results
    output_file = Path("app/data/processed/quick_berlin_green_spaces.json")
    # The target directory may not exist yet (e.g. on a fresh checkout).
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with_trees = sum(1 for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0)
    with_toilets = sum(1 for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0)
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)

    data = {
        "green_spaces": enhanced_spaces,
        "total_count": len(enhanced_spaces),
        "last_updated": datetime.now().isoformat(),
        "data_sources": ["quick_osm_processing", "berlin_tree_cadastre", "berlin_toilets"],
        "processing_info": {
            "method": "quick_scan_for_named_and_significant_spaces",
            "prioritizes_named_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": len(enhanced_spaces),
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            "tree_coverage": f"{round((with_trees/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%",
            "toilet_coverage": f"{round((with_toilets/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%"
        }
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print("\n🎉 Quick processing complete!")
    print(f"📁 Saved: {output_file}")
    print(f"📊 {len(enhanced_spaces)} spaces enhanced")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print("\n✨ Ready to use! This gives you real Berlin green spaces")
    print("   with actual tree and toilet data for personality scoring!")


# Script entry point: run the async pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(quick_process())