berlin-picnic-api/scripts/quick_green_spaces.py

558 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Quick Berlin green spaces processor.
Pre-filters OSM data efficiently, then processes only the best candidates.
"""
import json
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
import sys
import re
import math
# from tqdm.asyncio import tqdm # Not available, remove tqdm dependency
from xml.etree.ElementTree import iterparse
# Add the app directory to Python path
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
def calculate_polygon_area_sqm(coords):
"""Calculate area of a polygon using the Shoelace formula."""
if len(coords) < 3:
return 5000 # Default for invalid polygons
# Convert to radians and use spherical approximation for Earth
def to_radians(deg):
return deg * math.pi / 180
# Use simple planar approximation for small areas
# Convert lat/lng to approximate meters (rough approximation for Berlin area)
lat_center = sum(lat for lat, lng in coords) / len(coords)
lng_center = sum(lng for lat, lng in coords) / len(coords)
# Approximate meters per degree at Berlin latitude
meters_per_lat = 111320 # roughly constant
meters_per_lng = 111320 * math.cos(to_radians(lat_center))
# Convert coordinates to meters relative to center
meter_coords = []
for lat, lng in coords:
x = (lng - lng_center) * meters_per_lng
y = (lat - lat_center) * meters_per_lat
meter_coords.append((x, y))
# Shoelace formula
area = 0
n = len(meter_coords)
for i in range(n):
j = (i + 1) % n
area += meter_coords[i][0] * meter_coords[j][1]
area -= meter_coords[j][0] * meter_coords[i][1]
area = abs(area) / 2
# Reasonable bounds check
if area < 100: # Too small
return 5000
elif area > 10000000: # Too large (10 km²)
return 500000 # Cap at reasonable park size
return int(area)
def calculate_search_radius(area_sqm):
"""Calculate appropriate tree search radius based on park area."""
if area_sqm < 10000: # < 1 hectare
return 150
elif area_sqm < 50000: # < 5 hectares
return 300
elif area_sqm < 200000: # < 20 hectares
return 500
else: # Large parks like Treptower Park
return 800
def calculate_enhanced_shade_quality(tree_response, area_sqm):
"""Calculate enhanced shade quality based on real tree characteristics."""
metrics = tree_response.metrics
shade_analysis = tree_response.shade_analysis
# Base score from tree density and coverage
base_score = 0
# Factor 1: Actual shade coverage (crown area based)
coverage = metrics.shade_coverage_percent or 0
if coverage >= 60:
base_score += 40
elif coverage >= 40:
base_score += 30
elif coverage >= 20:
base_score += 20
elif coverage >= 10:
base_score += 10
# Factor 2: Large mature trees (better shade)
large_trees = len(shade_analysis.nearby_large_trees or [])
if large_trees >= 10:
base_score += 25
elif large_trees >= 5:
base_score += 20
elif large_trees >= 3:
base_score += 15
elif large_trees >= 1:
base_score += 10
# Factor 3: Tree density per area
trees_per_hectare = metrics.trees_per_hectare or 0
if trees_per_hectare >= 50:
base_score += 20
elif trees_per_hectare >= 30:
base_score += 15
elif trees_per_hectare >= 20:
base_score += 10
elif trees_per_hectare >= 10:
base_score += 5
# Factor 4: Average tree height (taller = better shade)
avg_height = metrics.average_height or 0
if avg_height >= 20:
base_score += 10
elif avg_height >= 15:
base_score += 8
elif avg_height >= 10:
base_score += 5
elif avg_height >= 5:
base_score += 3
# Factor 5: Crown diameter quality
avg_crown = metrics.average_crown_diameter or 0
if avg_crown >= 12:
base_score += 5
elif avg_crown >= 8:
base_score += 3
elif avg_crown >= 5:
base_score += 1
return min(100, base_score)
def detect_water_features(candidate):
"""Detect water features using OSM tags and name analysis."""
tags = candidate.get('tags', {})
name = candidate.get('name', '').lower()
# Check OSM water-related tags
water_tags = ['water', 'waterway', 'natural']
has_water_tags = any(
tags.get(tag, '').lower() in ['water', 'lake', 'pond', 'reservoir', 'river', 'stream']
for tag in water_tags
)
# Check name for water indicators
water_names = ['see', 'teich', 'weiher', 'water', 'lake', 'pond', 'fluss', 'river', 'bach', 'creek']
has_water_name = any(water_word in name for water_word in water_names)
# Check for fountain/brunnen
fountain_indicators = ['brunnen', 'fountain', 'springbrunnen']
has_fountain = any(fountain in name for fountain in fountain_indicators)
return has_water_tags or has_water_name or has_fountain
def estimate_berlin_district(lat: float, lng: float) -> str:
"""Estimate Berlin district from coordinates using geographic boundaries."""
# Northern districts
if lat > 52.55:
if lng < 13.25:
return "Reinickendorf"
elif lng < 13.45:
return "Pankow"
else:
return "Lichtenberg"
# Central-north districts
elif lat > 52.52:
if lng < 13.20:
return "Spandau"
elif lng < 13.30:
return "Charlottenburg-Wilmersdorf"
elif lng < 13.42:
return "Mitte"
elif lng < 13.48:
return "Friedrichshain-Kreuzberg"
else:
return "Lichtenberg"
# Central districts
elif lat > 52.48:
if lng < 13.20:
return "Spandau"
elif lng < 13.30:
return "Charlottenburg-Wilmersdorf"
elif lng < 13.35:
return "Tempelhof-Schöneberg"
elif lng < 13.42:
return "Mitte"
elif lng < 13.48:
return "Friedrichshain-Kreuzberg"
else:
return "Lichtenberg"
# Southern-central districts
elif lat > 52.45:
if lng < 13.20:
return "Steglitz-Zehlendorf"
elif lng < 13.35:
return "Tempelhof-Schöneberg"
elif lng < 13.45:
return "Neukölln"
elif lng < 13.55:
return "Treptow-Köpenick"
else:
return "Marzahn-Hellersdorf"
# Southern districts
else:
if lng < 13.35:
return "Steglitz-Zehlendorf"
else:
return "Treptow-Köpenick"
def get_specific_neighborhood(district: str, lat: float, lng: float) -> str:
"""Get specific neighborhood within district based on coordinates."""
neighborhoods = {
"Mitte": {
(52.540, 52.560, 13.33, 13.38): "Wedding",
(52.515, 52.530, 13.33, 13.38): "Moabit",
(52.510, 52.520, 13.35, 13.38): "Tiergarten",
(52.525, 52.545, 13.40, 13.43): "Prenzlauer Berg"
},
"Charlottenburg-Wilmersdorf": {
(52.485, 52.505, 13.30, 13.33): "Wilmersdorf",
(52.505, 52.525, 13.25, 13.33): "Charlottenburg"
},
"Friedrichshain-Kreuzberg": {
(52.490, 52.510, 13.38, 13.42): "Kreuzberg",
(52.510, 52.525, 13.42, 13.48): "Friedrichshain"
},
"Tempelhof-Schöneberg": {
(52.480, 52.500, 13.33, 13.37): "Schöneberg",
(52.460, 52.480, 13.37, 13.42): "Tempelhof"
},
"Steglitz-Zehlendorf": {
(52.430, 52.450, 13.23, 13.30): "Zehlendorf",
(52.450, 52.470, 13.30, 13.35): "Steglitz"
},
"Treptow-Köpenick": {
(52.430, 52.460, 13.55, 13.65): "Köpenick",
(52.480, 52.500, 13.45, 13.50): "Treptow"
}
}
if district in neighborhoods:
for (min_lat, max_lat, min_lng, max_lng), neighborhood in neighborhoods[district].items():
if min_lat <= lat <= max_lat and min_lng <= lng <= max_lng:
return neighborhood
return district
async def quick_process():
"""Quick processing of significant Berlin green spaces."""
print("🚀 Quick Berlin Green Spaces Processor")
print("=" * 45)
# Initialize services
tree_service = StreetTreeService()
berlin_data = BerlinDataService()
# Pre-load and index trees once to avoid repeated indexing
print("🔄 Pre-loading tree data and building spatial index...")
await tree_service._load_trees()
osm_file = Path("app/data/osm-raw/berlin_green_spaces.osm")
if not osm_file.exists():
print("❌ OSM file not found. Please ensure data is downloaded.")
return
print("🔍 Quick filtering for named parks and significant areas...")
print(f"📁 OSM file size: {osm_file.stat().st_size / (1024*1024):.1f} MB")
# Quick scan for good candidates
candidates = []
try:
processed = 0
print("🔍 Single-pass XML parsing - ways with embedded coordinates...")
# Single pass: parse ways with embedded coordinates
ways_processed = 0
current_way_tags = {}
current_way_coordinates = []
in_way = False
for event, elem in iterparse(osm_file, events=('start', 'end')):
if event == 'start':
if elem.tag == 'way':
in_way = True
current_way_tags = {}
current_way_coordinates = []
ways_processed += 1
if ways_processed % 1000 == 0:
print(f"Processed {ways_processed} ways, found {len(candidates)} candidates so far...")
elif in_way and elem.tag == 'tag':
k = elem.get('k')
v = elem.get('v')
if k and v:
current_way_tags[k] = v
elif in_way and elem.tag == 'nd':
# Extract coordinates directly from nd element
lat = elem.get('lat')
lon = elem.get('lon')
if lat and lon:
current_way_coordinates.append((float(lat), float(lon)))
continue
if elem.tag == 'way' and in_way:
in_way = False
tags = current_way_tags
coordinates = current_way_coordinates
# Quick filters for promising spaces - be more lenient
has_name = 'name' in tags
is_park = (tags.get('leisure') in ['park', 'garden', 'nature_reserve'] or
tags.get('landuse') in ['forest', 'grass', 'recreation_ground'])
# Also accept common green space tags
has_green_tags = any(key in tags for key in ['leisure', 'landuse', 'natural', 'amenity'])
if not (has_name or is_park or has_green_tags):
elem.clear() # Free memory
continue
# Use embedded coordinates directly
if not coordinates:
elem.clear() # Free memory
continue
# Get center coordinate and all coordinates for area calculation
lat, lng = coordinates[0] if len(coordinates) == 1 else (
sum(lat for lat, lng in coordinates) / len(coordinates),
sum(lng for lat, lng in coordinates) / len(coordinates)
)
# Basic Berlin bounds check
if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
elem.clear() # Free memory
continue
name = tags.get('name', f"Unnamed {tags.get('leisure', tags.get('landuse', 'area'))}")
space_type = tags.get('leisure') or tags.get('landuse') or 'park'
candidate = {
'id': f"quick_{elem.get('id')}",
'name': name,
'type': space_type,
'lat': lat,
'lng': lng,
'has_name': has_name,
'tags': tags,
'coordinates': coordinates # Store all coordinates for area calculation
}
candidates.append(candidate)
processed += 1
# Limit for quick processing
if len(candidates) >= 100:
elem.clear() # Free memory
break
elem.clear() # Free memory
else:
elem.clear() # Free memory
print(f"✅ Found {len(candidates)} promising green spaces")
except Exception as e:
print(f"❌ Error in quick filtering: {e}")
return
if not candidates:
print("No candidates found")
return
# Sort by having names (better quality)
candidates.sort(key=lambda x: x['has_name'], reverse=True)
print(f"\n🔧 Enhancing top {len(candidates)} spaces with real data...")
# Process candidates in parallel with batching
batch_size = 10 # Process 10 candidates at a time
enhanced_spaces = []
async def process_candidate(candidate):
"""Process a single candidate with tree and toilet data."""
try:
# Calculate actual area from OSM polygon coordinates
area_sqm = calculate_polygon_area_sqm(candidate.get('coordinates', []))
search_radius = calculate_search_radius(area_sqm)
# Get real tree data and toilet data concurrently with dynamic radius
tree_task = tree_service.get_trees_near_location(
candidate['lat'], candidate['lng'], radius_m=search_radius
)
toilet_task = berlin_data.get_toilets_near_point(
candidate['lat'], candidate['lng'], 500
)
print(f"🔍 Getting data for {candidate['name'][:30]}... (area: {area_sqm/10000:.1f}ha, radius: {search_radius}m)")
tree_response, nearby_toilets = await asyncio.gather(tree_task, toilet_task)
# Create enhanced space
enhanced_space = {
"id": candidate['id'],
"name": candidate['name'],
"description": f"Berlin {candidate['type']} discovered via quick OSM processing",
"type": "PARK", # Simplified for now
"coordinates": {
"lat": candidate['lat'],
"lng": candidate['lng']
},
"neighborhood": get_specific_neighborhood(estimate_berlin_district(candidate['lat'], candidate['lng']), candidate['lat'], candidate['lng']),
"area_sqm": area_sqm, # Real calculated area
# Environmental features from real tree data
"environmental": {
"tree_coverage_percent": max(5, int(tree_response.metrics.shade_coverage_percent)), # Use actual crown area calculation
"shade_quality": calculate_enhanced_shade_quality(tree_response, area_sqm),
"noise_level": 2, # Default
"wildlife_diversity_score": tree_response.metrics.species_diversity_score,
"water_features": detect_water_features(candidate),
"natural_surface_percent": 80
},
# Real tree data
"tree_data": {
"total_trees": tree_response.metrics.total_trees,
"trees_per_hectare": tree_response.metrics.trees_per_hectare,
"species_count": len(tree_response.metrics.dominant_species),
"species_diversity_score": tree_response.metrics.species_diversity_score,
"mature_trees_count": tree_response.metrics.mature_trees_count,
"young_trees_count": tree_response.metrics.young_trees_count,
"average_tree_age": tree_response.metrics.average_tree_age,
"average_height": tree_response.metrics.average_height,
"average_crown_diameter": tree_response.metrics.average_crown_diameter,
"shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
"dominant_species": tree_response.metrics.dominant_species[:3]
},
# Real toilet data
"toilet_accessibility": {
"nearby_toilets_count": len(nearby_toilets),
"accessibility_score": 80 if nearby_toilets else 30,
"nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
"free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
"accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
},
# Standard features
"accessibility": {
"wheelchair_accessible": True,
"public_transport_score": 3,
"cycling_infrastructure": True,
"parking_availability": 2,
"lighting_quality": 3
},
"recreation": {
"playground_quality": 60 if candidate['type'] == 'park' else 30,
"sports_facilities": candidate['type'] == 'recreation_ground',
"running_paths": True,
"cycling_paths": True,
"dog_friendly": True,
"bbq_allowed": candidate['type'] in ['park', 'recreation_ground']
},
"osm_metadata": {
"has_official_name": candidate['has_name'],
"tags": candidate['tags'],
"source": "quick_osm_processing"
},
"last_updated": datetime.now().isoformat(),
"data_sources": ["quick_osm_scan", "berlin_tree_cadastre", "berlin_toilets"],
"confidence_score": 90 if candidate['has_name'] else 75
}
return enhanced_space, tree_response.metrics.total_trees, len(nearby_toilets)
except Exception as e:
print(f"❌ Error processing {candidate['name']}: {e}")
return None, 0, 0
# Process candidates in batches with progress bar
for i in range(0, len(candidates), batch_size):
batch = candidates[i:i + batch_size]
print(f"Processing batch {i//batch_size + 1}/{(len(candidates) + batch_size - 1)//batch_size}")
# Process batch concurrently with progress bar
tasks = [process_candidate(candidate) for candidate in batch]
results = await asyncio.gather(*tasks)
# Collect results
for result, trees, toilets in results:
if result:
enhanced_spaces.append(result)
print(f"{result['name'][:40]:40} - {trees:3d} trees, {toilets} toilets")
# Small delay between batches to be respectful to APIs
if i + batch_size < len(candidates):
await asyncio.sleep(0.5)
# Save results
output_file = Path("app/data/processed/quick_berlin_green_spaces.json")
with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)
data = {
"green_spaces": enhanced_spaces,
"total_count": len(enhanced_spaces),
"last_updated": datetime.now().isoformat(),
"data_sources": ["quick_osm_processing", "berlin_tree_cadastre", "berlin_toilets"],
"processing_info": {
"method": "quick_scan_for_named_and_significant_spaces",
"prioritizes_named_spaces": True,
"enhanced_with_real_berlin_data": True
},
"summary_stats": {
"total_spaces": len(enhanced_spaces),
"spaces_with_tree_data": with_trees,
"spaces_with_toilet_data": with_toilets,
"total_trees_analyzed": total_trees,
"tree_coverage": f"{round((with_trees/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%",
"toilet_coverage": f"{round((with_toilets/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%"
}
}
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
print(f"\n🎉 Quick processing complete!")
print(f"📁 Saved: {output_file}")
print(f"📊 {len(enhanced_spaces)} spaces enhanced")
print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
print(f"🌿 {total_trees} total trees analyzed")
print(f"\n✨ Ready to use! This gives you real Berlin green spaces")
print(f" with actual tree and toilet data for personality scoring!")
if __name__ == "__main__":
asyncio.run(quick_process())