# berlin-picnic-api/scripts/osm_local_processor.py
# 613 lines, 24 KiB, Python
#!/usr/bin/env python3
"""
Process Berlin green spaces from local OSM data file.
Downloads Berlin OSM extract once, then processes locally without API dependencies.
"""
import json
import requests
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Tuple
import sys
import gzip
import math
# Add the app directory to Python path to import services
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
class LocalOSMProcessor:
    """Processes Berlin green spaces from a locally stored OSM extract.

    The extract is downloaded once; parsing happens fully offline. The
    enhancement step then uses the project's tree and toilet services.
    """

    def __init__(self, data_dir: str = "app/data"):
        """Set up working directories, helper services and the Berlin bbox."""
        self.data_dir = Path(data_dir)
        self.raw_dir = self.data_dir / "osm-raw"
        self.processed_dir = self.data_dir / "processed"

        # Make sure both working directories exist before any download/parse.
        for directory in (self.raw_dir, self.processed_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Existing project services used during the enhancement step.
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()

        # Rough bounding box of Berlin; used to filter OSM geometry.
        self.berlin_bbox = {
            'min_lat': 52.3370, 'max_lat': 52.6755,
            'min_lon': 13.0882, 'max_lon': 13.7611,
        }
def download_berlin_osm_extract(self):
    """Download the Berlin OSM extract from Geofabrik (one-time, ~50MB).

    Tries the PBF format first (smaller), then falls back to bz2.
    Returns the local file path, reusing a previous download of either
    format when present.

    Raises:
        RuntimeError: if no source could be downloaded.
    """
    urls = [
        "https://download.geofabrik.de/europe/germany/berlin-latest.osm.pbf",
        "https://download.geofabrik.de/europe/germany/berlin-latest.osm.bz2"
    ]

    # BUGFIX: previously only the .pbf filename was checked, so a prior
    # fallback download of the .bz2 file was ignored and re-fetched on
    # every run. Check every candidate filename.
    for url in urls:
        candidate = self.raw_dir / url.split('/')[-1]
        if candidate.exists():
            print(f"✅ OSM file already exists: {candidate}")
            return candidate

    for url in urls:
        try:
            print(f"Downloading Berlin OSM data from {url}")
            print("This is a one-time download (~50MB)...")
            response = requests.get(url, stream=True, timeout=300)
            response.raise_for_status()
            local_file = self.raw_dir / url.split('/')[-1]

            # Stream to disk in chunks, reporting progress whenever the
            # server announced a content length.
            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0
            with open(local_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            # flush so the \r progress line actually updates
                            print(f"\rDownload progress: {percent:.1f}%", end="", flush=True)
            print(f"\n✅ Downloaded: {local_file}")
            return local_file
        except Exception as e:
            # Best-effort: report and try the next mirror/format.
            print(f"❌ Failed to download {url}: {e}")
            continue

    raise RuntimeError("Could not download OSM data from any source")
def download_simple_osm_extract(self):
    """Download a one-time Overpass export of Berlin green spaces (OSM XML).

    Returns the path to the cached XML file; skips the download if the
    file already exists. Re-raises the underlying error on failure.
    """
    osm_file = self.raw_dir / "berlin_green_spaces.osm"
    if osm_file.exists():
        print(f"✅ OSM file already exists: {osm_file}")
        return osm_file

    print("Downloading Berlin green spaces extract...")
    overpass_url = "http://overpass-api.de/api/interpreter"

    # One-time query for all leisure/landuse/natural green areas in Berlin.
    # "out geom" inlines way geometry so no second node fetch is needed.
    bbox = (f"{self.berlin_bbox['min_lat']},{self.berlin_bbox['min_lon']},"
            f"{self.berlin_bbox['max_lat']},{self.berlin_bbox['max_lon']}")
    query = f"""
    [out:xml][timeout:120];
    (
      way["leisure"~"^(park|garden|nature_reserve|recreation_ground|playground|common)$"]({bbox});
      way["landuse"~"^(forest|grass|meadow|recreation_ground|village_green|allotments)$"]({bbox});
      way["natural"~"^(forest|grass|meadow|scrub|heath|wood)$"]({bbox});
    );
    out geom meta;
    """
    try:
        response = requests.post(overpass_url, data=query, timeout=180)
        response.raise_for_status()
        # BUGFIX: write the raw bytes. Overpass serves UTF-8 XML without a
        # charset parameter, so `response.text` may fall back to ISO-8859-1
        # and garble German names (umlauts). The XML declaration/parser
        # handles the encoding from the bytes.
        with open(osm_file, 'wb') as f:
            f.write(response.content)
        print(f"✅ Downloaded green spaces extract: {osm_file}")
        return osm_file
    except Exception as e:
        print(f"❌ Failed to download OSM extract: {e}")
        raise
def parse_osm_xml(self, osm_file: Path) -> List[Dict]:
    """Parse an OSM XML file (optionally gzipped) into green-space dicts.

    Note: this handles XML only; .pbf extracts cannot be parsed here.
    Returns an empty list (instead of raising) when parsing fails.
    """
    print(f"Parsing OSM data from {osm_file}...")
    green_spaces = []
    try:
        # Transparently handle a gzipped XML file.
        if osm_file.suffix == '.gz':
            with gzip.open(osm_file, 'rt', encoding='utf-8') as f:
                tree = ET.parse(f)
        else:
            tree = ET.parse(osm_file)
        root = tree.getroot()

        ways = root.findall('.//way')
        print(f"Found {len(ways)} ways in OSM data")

        skipped = 0
        for way in ways:
            try:
                processed_space = self._process_osm_way(way, root)
                if processed_space:
                    green_spaces.append(processed_space)
            except Exception:
                # Best-effort: one malformed way must not abort the run,
                # but count the skips instead of failing fully silently.
                skipped += 1
        if skipped:
            print(f"⚠️ Skipped {skipped} malformed ways")

        print(f"✅ Extracted {len(green_spaces)} green spaces from OSM data")
        return green_spaces
    except Exception as e:
        print(f"❌ Error parsing OSM file: {e}")
        return []
def _process_osm_way(self, way, root) -> Optional[Dict]:
    """Convert one OSM <way> element into the green-space dict format.

    Returns None for ways that are not green spaces, have fewer than 3
    in-bounds vertices, or cover less than 500 m².
    """
    # Collect the way's tags.
    tags = {tag.get('k'): tag.get('v') for tag in way.findall('tag')}

    green_space_type = self._get_green_space_type(tags)
    if not green_space_type:
        return None

    nds = way.findall('nd')
    if len(nds) < 3:  # an area needs at least 3 vertices
        return None

    # Resolve vertex coordinates.
    # BUGFIX: the Overpass query uses "out geom", which puts lat/lon
    # directly on the <nd> elements and emits no standalone <node>s for
    # way geometry — so the old per-ref root.find() lookup found nothing
    # and every way was discarded (and was O(n) per ref when nodes were
    # present). Prefer inline attributes; fall back to a node index that
    # is built at most once per call.
    node_index = None
    coordinates = []
    for nd in nds:
        lat_attr, lon_attr = nd.get('lat'), nd.get('lon')
        if lat_attr is None or lon_attr is None:
            if node_index is None:
                node_index = {n.get('id'): n for n in root.iter('node')}
            node = node_index.get(nd.get('ref'))
            if node is None:
                continue
            lat_attr, lon_attr = node.get('lat'), node.get('lon')
        lat, lon = float(lat_attr), float(lon_attr)
        # Keep only vertices inside the Berlin bounding box.
        if (self.berlin_bbox['min_lat'] <= lat <= self.berlin_bbox['max_lat'] and
                self.berlin_bbox['min_lon'] <= lon <= self.berlin_bbox['max_lon']):
            coordinates.append((lat, lon))

    if len(coordinates) < 3:
        return None

    # Derived geometry.
    centroid_lat, centroid_lon = self._calculate_centroid(coordinates)
    area_sqm = self._calculate_area(coordinates)
    if area_sqm < 500:  # skip very small areas
        return None

    # Fall back to a synthetic name when OSM has none.
    name = tags.get('name', f"{green_space_type.title()} near {centroid_lat:.3f}, {centroid_lon:.3f}")
    district = self._estimate_district(centroid_lat, centroid_lon)

    return {
        'id': f"osm_way_{way.get('id')}",
        'name': name,
        'fclass': green_space_type,
        'lat': centroid_lat,
        'lng': centroid_lon,
        'area_sqm': int(area_sqm),
        'district': district,
        'osm_tags': tags,
        'osm_id': way.get('id')
    }
def _get_green_space_type(self, tags: Dict) -> Optional[str]:
"""Determine if tags represent a green space and what type."""
# Check leisure tags
leisure = tags.get('leisure', '')
if leisure in ['park', 'garden', 'nature_reserve', 'recreation_ground',
'playground', 'common', 'golf_course']:
return leisure
# Check landuse tags
landuse = tags.get('landuse', '')
if landuse in ['forest', 'grass', 'meadow', 'recreation_ground',
'village_green', 'allotments']:
return landuse
# Check natural tags
natural = tags.get('natural', '')
if natural in ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood']:
return natural
return None
def _calculate_centroid(self, coordinates: List[Tuple[float, float]]) -> Tuple[float, float]:
"""Calculate centroid of polygon."""
lat_sum = sum(coord[0] for coord in coordinates)
lon_sum = sum(coord[1] for coord in coordinates)
count = len(coordinates)
return lat_sum / count, lon_sum / count
def _calculate_area(self, coordinates: List[Tuple[float, float]]) -> float:
"""Calculate area of polygon using shoelace formula."""
if len(coordinates) < 3:
return 0
# Convert to approximate meters for Berlin
lat_to_m = 111000 # meters per degree latitude
lon_to_m = 111000 * math.cos(math.radians(52.5)) # adjust for Berlin latitude
# Convert coordinates to meters
coords_m = [(lat * lat_to_m, lon * lon_to_m) for lat, lon in coordinates]
# Shoelace formula
area = 0
n = len(coords_m)
for i in range(n):
j = (i + 1) % n
area += coords_m[i][0] * coords_m[j][1]
area -= coords_m[j][0] * coords_m[i][1]
return abs(area) / 2
def _estimate_district(self, lat: float, lng: float) -> str:
"""Rough district estimation from coordinates."""
# Very rough Berlin district boundaries
if lat > 52.55:
return "Pankow" if lng < 13.45 else "Lichtenberg"
elif lat > 52.52:
if lng < 13.25:
return "Charlottenburg-Wilmersdorf"
elif lng < 13.42:
return "Mitte"
else:
return "Friedrichshain-Kreuzberg"
elif lat > 52.45:
if lng < 13.25:
return "Steglitz-Zehlendorf"
elif lng < 13.42:
return "Tempelhof-Schöneberg"
else:
return "Neukölln"
else:
return "Treptow-Köpenick"
async def enhance_green_space_with_real_data(self, space_data: Dict) -> Optional[Dict]:
"""Build the final enhanced-space record for one parsed green space.

Combines the OSM-derived geometry with live data from the project's
tree and toilet services plus several heuristic estimates.
Returns the enhanced dict, or None if any lookup fails (best-effort).
"""
try:
lat = space_data['lat']
lng = space_data['lng']
area_sqm = space_data['area_sqm']
print(f"Enhancing {space_data['name']} ({space_data['district']})...")
# Adaptive search radius: grows with sqrt(area) (i.e. the space's rough
# side length), clamped to the 100–350 m range.
radius = min(350, max(100, int((area_sqm ** 0.5) * 0.7)))
# Get real data using existing services.
# NOTE(review): the shapes of tree_response (.shade_analysis, .metrics)
# and the toilet dicts come from project services not visible here —
# the attribute/key accesses below mirror those contracts.
tree_response = await self.tree_service.get_trees_near_location(
lat, lng, radius_m=radius
)
nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 600)
# Calculate scores.
# NOTE(review): scoring assumes nearby_toilets is sorted by distance
# (index 0 = nearest) — confirm against BerlinDataService.
toilet_score = self._score_toilet_accessibility(nearby_toilets)
space_type = self._map_to_space_type(space_data.get('fclass', ''))
enhanced_space = {
"id": space_data['id'],
"name": space_data['name'],
"description": f"Berlin {space_data.get('fclass', 'green space')} from local OSM data",
"type": space_type,
"coordinates": {
"lat": float(lat),
"lng": float(lng)
},
"neighborhood": space_data.get('district', 'Unknown'),
"area_sqm": area_sqm,
# Perimeter approximated as if the area were a square (4 * side).
"perimeter_m": int(4 * (area_sqm ** 0.5)),
# Environmental features from real tree data (floored at 5% coverage).
"environmental": {
"tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
"shade_quality": tree_response.shade_analysis.shade_quality_score,
"noise_level": self._estimate_noise_level(space_data),
"wildlife_diversity_score": tree_response.metrics.species_diversity_score,
"water_features": self._detect_water_features(space_data),
"natural_surface_percent": self._estimate_natural_surface(space_data.get('fclass', ''))
},
# Real tree metrics copied through from the tree service response.
"tree_data": {
"total_trees": tree_response.metrics.total_trees,
"trees_per_hectare": tree_response.metrics.trees_per_hectare,
"species_count": len(tree_response.metrics.dominant_species),
"species_diversity_score": tree_response.metrics.species_diversity_score,
"mature_trees_count": tree_response.metrics.mature_trees_count,
"young_trees_count": tree_response.metrics.young_trees_count,
"average_tree_age": tree_response.metrics.average_tree_age,
"average_height": tree_response.metrics.average_height,
"average_crown_diameter": tree_response.metrics.average_crown_diameter,
"shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
"dominant_species": tree_response.metrics.dominant_species[:3]
},
# Real toilet accessibility within the 600 m search radius.
"toilet_accessibility": {
"nearby_toilets_count": len(nearby_toilets),
"accessibility_score": toilet_score,
"nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
"free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
"accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
},
# Standard features — heuristic defaults, not measured data.
"accessibility": {
"wheelchair_accessible": True,
"public_transport_score": self._estimate_transport_score(space_data.get('district', '')),
"cycling_infrastructure": area_sqm > 4000,
"parking_availability": 2 if area_sqm > 20000 else 1,
"lighting_quality": 3 if 'mitte' in space_data.get('district', '').lower() else 2
},
# Recreation features — size-based heuristics.
"recreation": {
"playground_quality": self._estimate_playground_quality(space_data),
"sports_facilities": self._estimate_sports_facilities(space_data),
"running_paths": area_sqm > 6000,
"cycling_paths": area_sqm > 12000,
"dog_friendly": True,
"bbq_allowed": self._allows_bbq(space_data)
},
# OSM metadata retained for traceability.
"osm_metadata": {
"osm_id": space_data.get('osm_id'),
"tags": space_data.get('osm_tags', {}),
"source": "local_osm_extract"
},
"last_updated": datetime.now().isoformat(),
"data_sources": ["local_osm_extract", "berlin_tree_cadastre", "berlin_toilets"],
# Fixed confidence constant for this data pipeline.
"confidence_score": 92
}
trees = tree_response.metrics.total_trees
toilets = len(nearby_toilets)
print(f"{space_data['name']}: {trees} trees, {toilets} toilets")
return enhanced_space
except Exception as e:
# Best-effort: a failed enhancement is reported and dropped (None).
print(f"❌ Error enhancing {space_data['name']}: {e}")
return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
if not nearby_toilets:
return 25
nearest = nearby_toilets[0]['distance_meters']
if nearest <= 200:
score = 90
elif nearest <= 400:
score = 70
else:
score = 50
# Quality bonuses
free = len([t for t in nearby_toilets if t.get('is_free', False)])
accessible = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
score += min(10, free * 5 + accessible * 3)
return min(100, score)
def _map_to_space_type(self, fclass: str) -> str:
mapping = {
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN', 'wood': 'FOREST',
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
'common': 'PARK', 'village_green': 'GRASS', 'allotments': 'GARDEN'
}
return mapping.get(fclass, 'PARK')
def _detect_water_features(self, space_data: Dict) -> bool:
name = space_data.get('name', '').lower()
tags = space_data.get('osm_tags', {})
water_keywords = ['see', 'teich', 'pond', 'lake', 'bach', 'spree', 'wasser']
return any(keyword in name for keyword in water_keywords) or 'water' in tags.values()
def _estimate_noise_level(self, space_data: Dict) -> int:
fclass = space_data.get('fclass', '')
district = space_data.get('district', '')
base = {'forest': 1, 'wood': 1, 'nature_reserve': 1, 'meadow': 2,
'park': 2, 'garden': 2, 'playground': 3}.get(fclass, 2)
if any(busy in district.lower() for busy in ['mitte', 'kreuzberg', 'friedrichshain']):
base += 1
return min(5, base)
def _estimate_natural_surface(self, fclass: str) -> int:
return {'forest': 95, 'wood': 95, 'nature_reserve': 90, 'meadow': 95,
'grass': 85, 'park': 75, 'garden': 65, 'playground': 40}.get(fclass, 70)
def _estimate_transport_score(self, district: str) -> int:
district_lower = district.lower()
if 'mitte' in district_lower:
return 5
elif any(name in district_lower for name in ['charlottenburg', 'kreuzberg', 'friedrichshain']):
return 4
else:
return 3
def _estimate_playground_quality(self, space_data: Dict) -> int:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
if fclass == 'playground':
return 80
elif 'playground' in tags.values():
return 75
elif fclass == 'park':
return 55
else:
return 30
def _estimate_sports_facilities(self, space_data: Dict) -> bool:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
name = space_data.get('name', '').lower()
return (fclass == 'recreation_ground' or
'sport' in str(tags.values()).lower() or
any(term in name for term in ['sport', 'football', 'tennis']))
def _allows_bbq(self, space_data: Dict) -> bool:
fclass = space_data.get('fclass', '')
area = space_data.get('area_sqm', 0)
tags = space_data.get('osm_tags', {})
# Check explicit BBQ tags
if tags.get('bbq') == 'yes':
return True
elif tags.get('bbq') == 'no':
return False
# Default based on type and size
return fclass in ['park', 'recreation_ground'] and area > 5000
async def process_all_green_spaces(self):
    """Full pipeline: download extract, parse, then enhance every space.

    Returns the list of enhanced space dicts (possibly empty on failure).
    """
    print("🌳 Processing Berlin green spaces from local OSM data...")

    # Step 1: get OSM data (the XML extract is more reliable than PBF).
    try:
        osm_file = self.download_simple_osm_extract()
    except Exception as e:
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and hid the actual error.
        print(f"❌ Could not download OSM data: {e}")
        return []

    # Step 2: parse green spaces out of the XML.
    green_spaces = self.parse_osm_xml(osm_file)
    if not green_spaces:
        print("❌ No green spaces found in OSM data")
        return []
    print(f"📊 Found {len(green_spaces)} green spaces to enhance")

    # Step 3: enhance each space with real tree/toilet data.
    enhanced_spaces = []
    for i, space_data in enumerate(green_spaces, 1):
        print(f"[{i}/{len(green_spaces)}]", end=" ")
        result = await self.enhance_green_space_with_real_data(space_data)
        if result:
            enhanced_spaces.append(result)
        # Every 20 spaces: progress report plus a short pause to stay
        # polite toward the underlying data services.
        if i % 20 == 0:
            print(f"\n Progress: {len(enhanced_spaces)} enhanced so far...")
            await asyncio.sleep(0.1)

    print(f"\n✅ Enhanced {len(enhanced_spaces)} spaces with real data!")
    return enhanced_spaces
def save_enhanced_data(self, enhanced_spaces: List[Dict]):
    """Write the final enhanced dataset (plus summary stats) to JSON.

    Returns the output file path.
    """
    output_file = self.processed_dir / "osm_berlin_green_spaces_enhanced.json"

    # Coverage statistics over the enhanced records.
    total = len(enhanced_spaces)
    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)

    # BUGFIX: guard against ZeroDivisionError when the result set is empty.
    def pct(part: int) -> str:
        return f"{round((part / total) * 100, 1)}%" if total else "0%"

    data = {
        "green_spaces": enhanced_spaces,
        "total_count": total,
        "last_updated": datetime.now().isoformat(),
        "data_sources": [
            "local_osm_extract_processed_offline",
            "berlin_tree_cadastre",
            "berlin_toilets"
        ],
        "processing_info": {
            "method": "local_osm_processing_no_api_dependency",
            "includes_all_osm_green_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": total,
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            "tree_coverage": pct(with_trees),
            "toilet_coverage": pct(with_toilets)
        }
    }

    # ensure_ascii=False keeps German names (umlauts) readable in the file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n🎉 Saved comprehensive dataset: {output_file}")
    print(f"📊 {total} total green spaces")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print(f"\n✨ Ready to replace mock data in your API!")
    return output_file
async def main():
    """Script entry point: run the full pipeline and save the result."""
    processor = LocalOSMProcessor()
    banner = "=" * 50
    try:
        print("🚀 Berlin Green Spaces: Local OSM Processing")
        print(banner)
        print("• Downloads OSM data once (no API dependency)")
        print("• Processes locally for all green spaces")
        print("• Enhances with real Berlin tree + toilet data")
        print(banner)

        enhanced_spaces = await processor.process_all_green_spaces()
        if enhanced_spaces:
            processor.save_enhanced_data(enhanced_spaces)
    except KeyboardInterrupt:
        print("\n⚠️ Interrupted")
    except Exception as e:
        print(f"❌ Error: {e}")


if __name__ == "__main__":
    asyncio.run(main())