#!/usr/bin/env python3
"""
Quick Berlin green spaces processor.

Pre-filters OSM data efficiently, then processes only the best candidates.
"""

import json
import asyncio
import xml.etree.ElementTree as ET  # noqa: F401  (kept: may be used by importers)
from pathlib import Path
from datetime import datetime
import sys
import re  # noqa: F401  (kept: may be used by importers)
import math
from typing import Any, Dict, List, Optional, Tuple
from xml.etree.ElementTree import iterparse

# NOTE: tqdm is not available, so progress is reported with plain print() calls.

# Add the app directory to Python path
sys.path.append(str(Path(__file__).parent.parent))

from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService


# --- Configuration -----------------------------------------------------------

_OSM_FILE = Path("app/data/osm-raw/berlin_green_spaces.osm")
_OUTPUT_FILE = Path("app/data/processed/quick_berlin_green_spaces.json")

_DEFAULT_MAX_CANDIDATES = 100  # stop scanning the OSM file after this many hits
_DEFAULT_BATCH_SIZE = 10       # candidates enhanced concurrently per batch

# Rough number of meters spanned by one degree of latitude.
_METERS_PER_DEGREE = 111320

# (threshold, points) tables, highest threshold first. The first threshold the
# value reaches decides the points; a value below every threshold scores 0.
_COVERAGE_TIERS = ((60, 40), (40, 30), (20, 20), (10, 10))
_LARGE_TREE_TIERS = ((10, 25), (5, 20), (3, 15), (1, 10))
_DENSITY_TIERS = ((50, 20), (30, 15), (20, 10), (10, 5))
_HEIGHT_TIERS = ((20, 10), (15, 8), (10, 5), (5, 3))
_CROWN_TIERS = ((12, 5), (8, 3), (5, 1))

_WATER_TAG_KEYS = ('water', 'waterway', 'natural')
_WATER_TAG_VALUES = ('water', 'lake', 'pond', 'reservoir', 'river', 'stream')
_WATER_NAME_WORDS = ('see', 'teich', 'weiher', 'water', 'lake', 'pond',
                     'fluss', 'river', 'bach', 'creek')
_FOUNTAIN_NAME_WORDS = ('brunnen', 'fountain', 'springbrunnen')

# district -> {(min_lat, max_lat, min_lng, max_lng): neighborhood}
# Boxes are tested in insertion order, so where two boxes overlap the one
# listed first wins.
_NEIGHBORHOOD_BOXES = {
    "Mitte": {
        (52.540, 52.560, 13.33, 13.38): "Wedding",
        (52.515, 52.530, 13.33, 13.38): "Moabit",
        (52.510, 52.520, 13.35, 13.38): "Tiergarten",
        (52.525, 52.545, 13.40, 13.43): "Prenzlauer Berg",
    },
    "Charlottenburg-Wilmersdorf": {
        (52.485, 52.505, 13.30, 13.33): "Wilmersdorf",
        (52.505, 52.525, 13.25, 13.33): "Charlottenburg",
    },
    "Friedrichshain-Kreuzberg": {
        (52.490, 52.510, 13.38, 13.42): "Kreuzberg",
        (52.510, 52.525, 13.42, 13.48): "Friedrichshain",
    },
    "Tempelhof-Schöneberg": {
        (52.480, 52.500, 13.33, 13.37): "Schöneberg",
        (52.460, 52.480, 13.37, 13.42): "Tempelhof",
    },
    "Steglitz-Zehlendorf": {
        (52.430, 52.450, 13.23, 13.30): "Zehlendorf",
        (52.450, 52.470, 13.30, 13.35): "Steglitz",
    },
    "Treptow-Köpenick": {
        (52.430, 52.460, 13.55, 13.65): "Köpenick",
        (52.480, 52.500, 13.45, 13.50): "Treptow",
    },
}


# --- Geometry ----------------------------------------------------------------

def calculate_polygon_area_sqm(coords):
    """Calculate area of a polygon using the Shoelace formula.

    Args:
        coords: Sequence of ``(lat, lng)`` pairs in degrees describing the
            polygon outline.

    Returns:
        The area in square meters as an ``int``. Falls back to ``5000`` when
        fewer than three points are given or the computed area is below
        100 m², and to ``500000`` when the computed area exceeds 10 km².
    """
    if len(coords) < 3:
        return 5000  # Default for invalid polygons

    # Planar approximation: project lat/lng to meters around the centroid.
    lat_center = sum(lat for lat, _ in coords) / len(coords)
    lng_center = sum(lng for _, lng in coords) / len(coords)

    meters_per_lat = _METERS_PER_DEGREE  # roughly constant
    # Longitude degrees shrink with the cosine of the latitude.
    meters_per_lng = _METERS_PER_DEGREE * math.cos(math.radians(lat_center))

    meter_coords = [
        ((lng - lng_center) * meters_per_lng, (lat - lat_center) * meters_per_lat)
        for lat, lng in coords
    ]

    # Shoelace formula over consecutive vertex pairs (wrapping to the start).
    area = 0
    n = len(meter_coords)
    for i, (x_i, y_i) in enumerate(meter_coords):
        x_j, y_j = meter_coords[(i + 1) % n]
        area += x_i * y_j
        area -= x_j * y_i
    area = abs(area) / 2

    # Reasonable bounds check
    if area < 100:  # Too small
        return 5000
    if area > 10000000:  # Too large (10 km²)
        return 500000  # Cap at reasonable park size
    return int(area)


def calculate_search_radius(area_sqm):
    """Calculate appropriate tree search radius based on park area.

    Args:
        area_sqm: Park area in square meters.

    Returns:
        Search radius in meters: 150, 300, 500 or 800 depending on the area.
    """
    if area_sqm < 10000:  # < 1 hectare
        return 150
    if area_sqm < 50000:  # < 5 hectares
        return 300
    if area_sqm < 200000:  # < 20 hectares
        return 500
    return 800  # Large parks like Treptower Park


# --- Scoring -----------------------------------------------------------------

def _tiered_score(value, tiers):
    """Return the points of the first ``(threshold, points)`` tier reached."""
    for threshold, points in tiers:
        if value >= threshold:
            return points
    return 0


def calculate_enhanced_shade_quality(tree_response, area_sqm):
    """Calculate enhanced shade quality based on real tree characteristics.

    Args:
        tree_response: Object exposing ``metrics`` (with
            ``shade_coverage_percent``, ``trees_per_hectare``,
            ``average_height``, ``average_crown_diameter``) and
            ``shade_analysis`` (with ``nearby_large_trees``). Missing
            (``None``) values are treated as 0 / empty.
        area_sqm: Park area; accepted for interface compatibility but not
            used by the score.

    Returns:
        A score between 0 and 100.
    """
    metrics = tree_response.metrics
    shade_analysis = tree_response.shade_analysis

    score = 0
    # Factor 1: Actual shade coverage (crown area based)
    score += _tiered_score(metrics.shade_coverage_percent or 0, _COVERAGE_TIERS)
    # Factor 2: Large mature trees (better shade)
    score += _tiered_score(len(shade_analysis.nearby_large_trees or []),
                           _LARGE_TREE_TIERS)
    # Factor 3: Tree density per area
    score += _tiered_score(metrics.trees_per_hectare or 0, _DENSITY_TIERS)
    # Factor 4: Average tree height (taller = better shade)
    score += _tiered_score(metrics.average_height or 0, _HEIGHT_TIERS)
    # Factor 5: Crown diameter quality
    score += _tiered_score(metrics.average_crown_diameter or 0, _CROWN_TIERS)

    return min(100, score)


# --- Feature detection -------------------------------------------------------

def detect_water_features(candidate):
    """Detect water features using OSM tags and name analysis.

    Args:
        candidate: Dict with optional ``'tags'`` (dict of OSM tags) and
            ``'name'`` (str) entries.

    Returns:
        ``True`` when a water-related tag value is present or the name
        contains a water/fountain word, ``False`` otherwise.
    """
    tags = candidate.get('tags') or {}
    name = (candidate.get('name') or '').lower()

    # Check OSM water-related tags
    has_water_tags = any(
        tags.get(key, '').lower() in _WATER_TAG_VALUES
        for key in _WATER_TAG_KEYS
    )

    # NOTE: plain substring matching, so a word such as 'see' also matches
    # inside longer, unrelated words.
    has_water_name = any(word in name for word in _WATER_NAME_WORDS)

    # Check for fountain/brunnen
    has_fountain = any(word in name for word in _FOUNTAIN_NAME_WORDS)

    return has_water_tags or has_water_name or has_fountain


# --- Location heuristics -----------------------------------------------------

def estimate_berlin_district(lat: float, lng: float) -> str:
    """Estimate Berlin district from coordinates using geographic boundaries.

    The city is cut into latitude bands, each of which is cut into longitude
    ranges; this is a coarse heuristic, not a real boundary lookup.

    Args:
        lat: Latitude in degrees.
        lng: Longitude in degrees.

    Returns:
        The estimated district name.
    """
    # Northern districts
    if lat > 52.55:
        if lng < 13.25:
            return "Reinickendorf"
        if lng < 13.45:
            return "Pankow"
        return "Lichtenberg"

    # Central-north districts
    if lat > 52.52:
        if lng < 13.20:
            return "Spandau"
        if lng < 13.30:
            return "Charlottenburg-Wilmersdorf"
        if lng < 13.42:
            return "Mitte"
        if lng < 13.48:
            return "Friedrichshain-Kreuzberg"
        return "Lichtenberg"

    # Central districts
    if lat > 52.48:
        if lng < 13.20:
            return "Spandau"
        if lng < 13.30:
            return "Charlottenburg-Wilmersdorf"
        if lng < 13.35:
            return "Tempelhof-Schöneberg"
        if lng < 13.42:
            return "Mitte"
        if lng < 13.48:
            return "Friedrichshain-Kreuzberg"
        return "Lichtenberg"

    # Southern-central districts
    if lat > 52.45:
        if lng < 13.20:
            return "Steglitz-Zehlendorf"
        if lng < 13.35:
            return "Tempelhof-Schöneberg"
        if lng < 13.45:
            return "Neukölln"
        if lng < 13.55:
            return "Treptow-Köpenick"
        return "Marzahn-Hellersdorf"

    # Southern districts
    if lng < 13.35:
        return "Steglitz-Zehlendorf"
    return "Treptow-Köpenick"


def get_specific_neighborhood(district: str, lat: float, lng: float) -> str:
    """Get specific neighborhood within district based on coordinates.

    Args:
        district: District name, e.g. as returned by
            :func:`estimate_berlin_district`.
        lat: Latitude in degrees.
        lng: Longitude in degrees.

    Returns:
        The name of the first bounding box (bounds inclusive) of the district
        that contains the point, or ``district`` itself when the district has
        no boxes or none of them match.
    """
    for (min_lat, max_lat, min_lng, max_lng), neighborhood in \
            _NEIGHBORHOOD_BOXES.get(district, {}).items():
        if min_lat <= lat <= max_lat and min_lng <= lng <= max_lng:
            return neighborhood
    return district


# --- OSM scanning ------------------------------------------------------------

def _candidate_from_way(
    way_id: Optional[str],
    tags: Dict[str, str],
    coordinates: List[Tuple[float, float]],
) -> Optional[Dict[str, Any]]:
    """Turn one parsed OSM way into a candidate dict, or ``None`` if filtered out."""
    # Quick filters for promising spaces - deliberately lenient
    has_name = 'name' in tags
    is_park = (
        tags.get('leisure') in ['park', 'garden', 'nature_reserve']
        or tags.get('landuse') in ['forest', 'grass', 'recreation_ground']
    )
    # Also accept common green space tags
    has_green_tags = any(
        key in tags for key in ['leisure', 'landuse', 'natural', 'amenity']
    )
    if not (has_name or is_park or has_green_tags):
        return None

    # Ways without embedded coordinates cannot be located.
    if not coordinates:
        return None

    # Center = mean of all outline points (a single point is its own mean).
    lat = sum(p_lat for p_lat, _ in coordinates) / len(coordinates)
    lng = sum(p_lng for _, p_lng in coordinates) / len(coordinates)

    # Basic Berlin bounds check
    if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
        return None

    name = tags.get(
        'name',
        f"Unnamed {tags.get('leisure', tags.get('landuse', 'area'))}"
    )
    space_type = tags.get('leisure') or tags.get('landuse') or 'park'

    return {
        'id': f"quick_{way_id}",
        'name': name,
        'type': space_type,
        'lat': lat,
        'lng': lng,
        'has_name': has_name,
        'tags': tags,
        'coordinates': coordinates,  # Store all coordinates for area calculation
    }


def _scan_osm_candidates(
    osm_file: Path,
    max_candidates: int = _DEFAULT_MAX_CANDIDATES,
) -> List[Dict[str, Any]]:
    """Stream the OSM file once and collect candidate green spaces.

    Scanning stops as soon as ``max_candidates`` candidates were collected;
    the limit is checked after each hit, so at least one candidate is kept.
    """
    candidates = []
    ways_processed = 0
    current_way_tags = {}
    current_way_coordinates = []
    in_way = False

    for event, elem in iterparse(osm_file, events=('start', 'end')):
        if event == 'start':
            # Attributes are already available on 'start', so tags and node
            # coordinates are collected here while the way is still open.
            if elem.tag == 'way':
                in_way = True
                current_way_tags = {}
                current_way_coordinates = []
                ways_processed += 1
                if ways_processed % 1000 == 0:
                    print(f"Processed {ways_processed} ways, found {len(candidates)} candidates so far...")
            elif in_way and elem.tag == 'tag':
                k = elem.get('k')
                v = elem.get('v')
                if k and v:
                    current_way_tags[k] = v
            elif in_way and elem.tag == 'nd':
                # Extract coordinates directly from nd element
                lat = elem.get('lat')
                lon = elem.get('lon')
                if lat and lon:
                    current_way_coordinates.append((float(lat), float(lon)))
            continue

        # 'end' event: only a closing <way> can yield a candidate.
        if elem.tag == 'way' and in_way:
            in_way = False
            candidate = _candidate_from_way(
                elem.get('id'), current_way_tags, current_way_coordinates
            )
            elem.clear()  # Free memory
            if candidate is None:
                continue
            candidates.append(candidate)
            # Limit for quick processing
            if len(candidates) >= max_candidates:
                break
        else:
            elem.clear()  # Free memory

    return candidates


# --- Enhancement -------------------------------------------------------------

def _build_enhanced_space(
    candidate: Dict[str, Any],
    area_sqm: int,
    tree_response: Any,
    nearby_toilets: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Assemble the output record for one candidate from the fetched data."""
    metrics = tree_response.metrics
    lat, lng = candidate['lat'], candidate['lng']
    # Missing values must not abort the whole record.
    shade_coverage = metrics.shade_coverage_percent or 0
    dominant_species = metrics.dominant_species or []

    return {
        "id": candidate['id'],
        "name": candidate['name'],
        "description": f"Berlin {candidate['type']} discovered via quick OSM processing",
        "type": "PARK",  # Simplified for now
        "coordinates": {
            "lat": lat,
            "lng": lng
        },
        "neighborhood": get_specific_neighborhood(
            estimate_berlin_district(lat, lng), lat, lng
        ),
        "area_sqm": area_sqm,  # Real calculated area

        # Environmental features from real tree data
        "environmental": {
            # Use actual crown area calculation, with a floor of 5 percent
            "tree_coverage_percent": max(5, int(shade_coverage)),
            "shade_quality": calculate_enhanced_shade_quality(tree_response, area_sqm),
            "noise_level": 2,  # Default
            "wildlife_diversity_score": metrics.species_diversity_score,
            "water_features": detect_water_features(candidate),
            "natural_surface_percent": 80
        },

        # Real tree data
        "tree_data": {
            "total_trees": metrics.total_trees,
            "trees_per_hectare": metrics.trees_per_hectare,
            "species_count": len(dominant_species),
            "species_diversity_score": metrics.species_diversity_score,
            "mature_trees_count": metrics.mature_trees_count,
            "young_trees_count": metrics.young_trees_count,
            "average_tree_age": metrics.average_tree_age,
            "average_height": metrics.average_height,
            "average_crown_diameter": metrics.average_crown_diameter,
            "shade_coverage_percent": metrics.shade_coverage_percent,
            "dominant_species": dominant_species[:3]
        },

        # Real toilet data
        "toilet_accessibility": {
            "nearby_toilets_count": len(nearby_toilets),
            "accessibility_score": 80 if nearby_toilets else 30,
            # NOTE(review): assumes the service returns the nearest toilet
            # first - confirm against BerlinDataService.
            "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
            "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
            "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
        },

        # Standard features (fixed defaults, not measured)
        "accessibility": {
            "wheelchair_accessible": True,
            "public_transport_score": 3,
            "cycling_infrastructure": True,
            "parking_availability": 2,
            "lighting_quality": 3
        },
        "recreation": {
            "playground_quality": 60 if candidate['type'] == 'park' else 30,
            "sports_facilities": candidate['type'] == 'recreation_ground',
            "running_paths": True,
            "cycling_paths": True,
            "dog_friendly": True,
            "bbq_allowed": candidate['type'] in ['park', 'recreation_ground']
        },
        "osm_metadata": {
            "has_official_name": candidate['has_name'],
            "tags": candidate['tags'],
            "source": "quick_osm_processing"
        },
        "last_updated": datetime.now().isoformat(),
        "data_sources": ["quick_osm_scan", "berlin_tree_cadastre", "berlin_toilets"],
        "confidence_score": 90 if candidate['has_name'] else 75
    }


async def _enhance_candidate(candidate, tree_service, berlin_data):
    """Process a single candidate with tree and toilet data.

    Returns:
        ``(enhanced_space, total_trees, toilet_count)``; on any error the
        failure is printed and ``(None, 0, 0)`` is returned so that one bad
        candidate does not abort the batch.
    """
    try:
        # Calculate actual area from OSM polygon coordinates
        area_sqm = calculate_polygon_area_sqm(candidate.get('coordinates', []))
        search_radius = calculate_search_radius(area_sqm)

        # Get real tree data and toilet data concurrently with dynamic radius
        tree_task = tree_service.get_trees_near_location(
            candidate['lat'], candidate['lng'], radius_m=search_radius
        )
        toilet_task = berlin_data.get_toilets_near_point(
            candidate['lat'], candidate['lng'], 500
        )
        print(f"🔍 Getting data for {candidate['name'][:30]}... (area: {area_sqm/10000:.1f}ha, radius: {search_radius}m)")
        tree_response, nearby_toilets = await asyncio.gather(tree_task, toilet_task)

        enhanced_space = _build_enhanced_space(
            candidate, area_sqm, tree_response, nearby_toilets
        )
        return enhanced_space, tree_response.metrics.total_trees, len(nearby_toilets)
    except Exception as e:  # best-effort: report and skip this candidate
        print(f"❌ Error processing {candidate['name']}: {e}")
        return None, 0, 0


# --- Entry point -------------------------------------------------------------

async def quick_process(*, max_candidates=_DEFAULT_MAX_CANDIDATES,
                        batch_size=_DEFAULT_BATCH_SIZE):
    """Quick processing of significant Berlin green spaces.

    Scans the OSM extract for candidate green spaces, enhances each with tree
    and toilet data from the project services, and writes the result as JSON.

    Args:
        max_candidates: Stop scanning the OSM file after this many candidates.
        batch_size: Number of candidates enhanced concurrently per batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print("🚀 Quick Berlin Green Spaces Processor")
    print("=" * 45)

    # Initialize services
    tree_service = StreetTreeService()
    berlin_data = BerlinDataService()

    # Pre-load and index trees once to avoid repeated indexing
    print("🔄 Pre-loading tree data and building spatial index...")
    await tree_service._load_trees()

    osm_file = _OSM_FILE
    if not osm_file.exists():
        print("❌ OSM file not found. Please ensure data is downloaded.")
        return

    print("🔍 Quick filtering for named parks and significant areas...")
    print(f"📁 OSM file size: {osm_file.stat().st_size / (1024*1024):.1f} MB")

    # Quick scan for good candidates
    try:
        print("🔍 Single-pass XML parsing - ways with embedded coordinates...")
        candidates = _scan_osm_candidates(osm_file, max_candidates)
        print(f"✅ Found {len(candidates)} promising green spaces")
    except Exception as e:  # top-level boundary of the script: report and stop
        print(f"❌ Error in quick filtering: {e}")
        return

    if not candidates:
        print("No candidates found")
        return

    # Sort by having names (better quality); the sort is stable, so file
    # order is kept within the named and unnamed groups.
    candidates.sort(key=lambda c: c['has_name'], reverse=True)

    print(f"\n🔧 Enhancing top {len(candidates)} spaces with real data...")

    # Process candidates concurrently, one batch at a time
    enhanced_spaces = []
    total_batches = (len(candidates) + batch_size - 1) // batch_size
    for start in range(0, len(candidates), batch_size):
        batch = candidates[start:start + batch_size]
        print(f"Processing batch {start // batch_size + 1}/{total_batches}")

        results = await asyncio.gather(
            *(_enhance_candidate(c, tree_service, berlin_data) for c in batch)
        )

        # Collect results, skipping candidates that failed
        for space, trees, toilets in results:
            if space:
                enhanced_spaces.append(space)
                print(f"✅ {space['name'][:40]:40} - {trees:3d} trees, {toilets} toilets")

        # Small delay between batches to be respectful to APIs
        if start + batch_size < len(candidates):
            await asyncio.sleep(0.5)

    # Save results
    output_file = _OUTPUT_FILE
    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)

    data = {
        "green_spaces": enhanced_spaces,
        "total_count": len(enhanced_spaces),
        "last_updated": datetime.now().isoformat(),
        "data_sources": ["quick_osm_processing", "berlin_tree_cadastre", "berlin_toilets"],
        "processing_info": {
            "method": "quick_scan_for_named_and_significant_spaces",
            "prioritizes_named_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": len(enhanced_spaces),
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            "tree_coverage": f"{round((with_trees/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%",
            "toilet_coverage": f"{round((with_toilets/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%"
        }
    }

    # The output directory may not exist on a fresh checkout.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print("\n🎉 Quick processing complete!")
    print(f"📁 Saved: {output_file}")
    print(f"📊 {len(enhanced_spaces)} spaces enhanced")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print("\n✨ Ready to use! This gives you real Berlin green spaces")
    print(" with actual tree and toilet data for personality scoring!")


if __name__ == "__main__":
    asyncio.run(quick_process())