# berlin-picnic-api/scripts/osm_local_processor.py
# 613 lines, 24 KiB, Python
#!/usr/bin/env python3
"""
Process Berlin green spaces from local OSM data file.
Downloads Berlin OSM extract once, then processes locally without API dependencies.
"""
import json
import requests
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Tuple
import sys
import gzip
import math
# Add the app directory to Python path to import services
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
class LocalOSMProcessor:
    """Processes Berlin green spaces from a locally stored OSM extract.

    The extract is downloaded once; parsing happens fully offline. The
    enhancement step then uses the project's tree and toilet services.
    """

    def __init__(self, data_dir: str = "app/data"):
        """Set up working directories, helper services and the Berlin bbox."""
        self.data_dir = Path(data_dir)
        self.raw_dir = self.data_dir / "osm-raw"
        self.processed_dir = self.data_dir / "processed"

        # Make sure both working directories exist before any download/parse.
        for directory in (self.raw_dir, self.processed_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Existing project services used during the enhancement step.
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()

        # Rough bounding box of Berlin; used to filter OSM geometry.
        self.berlin_bbox = {
            'min_lat': 52.3370, 'max_lat': 52.6755,
            'min_lon': 13.0882, 'max_lon': 13.7611,
        }
def download_berlin_osm_extract(self):
    """Download the Berlin OSM extract from Geofabrik (one-time, ~50MB).

    Tries the PBF format first (smaller), then falls back to bz2.
    Returns the local file path, reusing a previous download of either
    format when present.

    Raises:
        RuntimeError: if no source could be downloaded.
    """
    urls = [
        "https://download.geofabrik.de/europe/germany/berlin-latest.osm.pbf",
        "https://download.geofabrik.de/europe/germany/berlin-latest.osm.bz2"
    ]

    # BUGFIX: previously only the .pbf filename was checked, so a prior
    # fallback download of the .bz2 file was ignored and re-fetched on
    # every run. Check every candidate filename.
    for url in urls:
        candidate = self.raw_dir / url.split('/')[-1]
        if candidate.exists():
            print(f"✅ OSM file already exists: {candidate}")
            return candidate

    for url in urls:
        try:
            print(f"Downloading Berlin OSM data from {url}")
            print("This is a one-time download (~50MB)...")
            response = requests.get(url, stream=True, timeout=300)
            response.raise_for_status()
            local_file = self.raw_dir / url.split('/')[-1]

            # Stream to disk in chunks, reporting progress whenever the
            # server announced a content length.
            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0
            with open(local_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            # flush so the \r progress line actually updates
                            print(f"\rDownload progress: {percent:.1f}%", end="", flush=True)
            print(f"\n✅ Downloaded: {local_file}")
            return local_file
        except Exception as e:
            # Best-effort: report and try the next mirror/format.
            print(f"❌ Failed to download {url}: {e}")
            continue

    raise RuntimeError("Could not download OSM data from any source")
def download_simple_osm_extract(self):
    """Download a one-time Overpass export of Berlin green spaces (OSM XML).

    Returns the path to the cached XML file; skips the download if the
    file already exists. Re-raises the underlying error on failure.
    """
    osm_file = self.raw_dir / "berlin_green_spaces.osm"
    if osm_file.exists():
        print(f"✅ OSM file already exists: {osm_file}")
        return osm_file

    print("Downloading Berlin green spaces extract...")
    overpass_url = "http://overpass-api.de/api/interpreter"

    # One-time query for all leisure/landuse/natural green areas in Berlin.
    # "out geom" inlines way geometry so no second node fetch is needed.
    bbox = (f"{self.berlin_bbox['min_lat']},{self.berlin_bbox['min_lon']},"
            f"{self.berlin_bbox['max_lat']},{self.berlin_bbox['max_lon']}")
    query = f"""
    [out:xml][timeout:120];
    (
      way["leisure"~"^(park|garden|nature_reserve|recreation_ground|playground|common)$"]({bbox});
      way["landuse"~"^(forest|grass|meadow|recreation_ground|village_green|allotments)$"]({bbox});
      way["natural"~"^(forest|grass|meadow|scrub|heath|wood)$"]({bbox});
    );
    out geom meta;
    """
    try:
        response = requests.post(overpass_url, data=query, timeout=180)
        response.raise_for_status()
        # BUGFIX: write the raw bytes. Overpass serves UTF-8 XML without a
        # charset parameter, so `response.text` may fall back to ISO-8859-1
        # and garble German names (umlauts). The XML declaration/parser
        # handles the encoding from the bytes.
        with open(osm_file, 'wb') as f:
            f.write(response.content)
        print(f"✅ Downloaded green spaces extract: {osm_file}")
        return osm_file
    except Exception as e:
        print(f"❌ Failed to download OSM extract: {e}")
        raise
def parse_osm_xml(self, osm_file: Path) -> List[Dict]:
    """Parse an OSM XML file (optionally gzipped) into green-space dicts.

    Note: this handles XML only; .pbf extracts cannot be parsed here.
    Returns an empty list (instead of raising) when parsing fails.
    """
    print(f"Parsing OSM data from {osm_file}...")
    green_spaces = []
    try:
        # Transparently handle a gzipped XML file.
        if osm_file.suffix == '.gz':
            with gzip.open(osm_file, 'rt', encoding='utf-8') as f:
                tree = ET.parse(f)
        else:
            tree = ET.parse(osm_file)
        root = tree.getroot()

        ways = root.findall('.//way')
        print(f"Found {len(ways)} ways in OSM data")

        skipped = 0
        for way in ways:
            try:
                processed_space = self._process_osm_way(way, root)
                if processed_space:
                    green_spaces.append(processed_space)
            except Exception:
                # Best-effort: one malformed way must not abort the run,
                # but count the skips instead of failing fully silently.
                skipped += 1
        if skipped:
            print(f"⚠️ Skipped {skipped} malformed ways")

        print(f"✅ Extracted {len(green_spaces)} green spaces from OSM data")
        return green_spaces
    except Exception as e:
        print(f"❌ Error parsing OSM file: {e}")
        return []
def _process_osm_way(self, way, root) -> Optional[Dict]:
    """Convert one OSM <way> element into the green-space dict format.

    Returns None for ways that are not green spaces, have fewer than 3
    in-bounds vertices, or cover less than 500 m².
    """
    # Collect the way's tags.
    tags = {tag.get('k'): tag.get('v') for tag in way.findall('tag')}

    green_space_type = self._get_green_space_type(tags)
    if not green_space_type:
        return None

    nds = way.findall('nd')
    if len(nds) < 3:  # an area needs at least 3 vertices
        return None

    # Resolve vertex coordinates.
    # BUGFIX: the Overpass query uses "out geom", which puts lat/lon
    # directly on the <nd> elements and emits no standalone <node>s for
    # way geometry — so the old per-ref root.find() lookup found nothing
    # and every way was discarded (and was O(n) per ref when nodes were
    # present). Prefer inline attributes; fall back to a node index that
    # is built at most once per call.
    node_index = None
    coordinates = []
    for nd in nds:
        lat_attr, lon_attr = nd.get('lat'), nd.get('lon')
        if lat_attr is None or lon_attr is None:
            if node_index is None:
                node_index = {n.get('id'): n for n in root.iter('node')}
            node = node_index.get(nd.get('ref'))
            if node is None:
                continue
            lat_attr, lon_attr = node.get('lat'), node.get('lon')
        lat, lon = float(lat_attr), float(lon_attr)
        # Keep only vertices inside the Berlin bounding box.
        if (self.berlin_bbox['min_lat'] <= lat <= self.berlin_bbox['max_lat'] and
                self.berlin_bbox['min_lon'] <= lon <= self.berlin_bbox['max_lon']):
            coordinates.append((lat, lon))

    if len(coordinates) < 3:
        return None

    # Derived geometry.
    centroid_lat, centroid_lon = self._calculate_centroid(coordinates)
    area_sqm = self._calculate_area(coordinates)
    if area_sqm < 500:  # skip very small areas
        return None

    # Fall back to a synthetic name when OSM has none.
    name = tags.get('name', f"{green_space_type.title()} near {centroid_lat:.3f}, {centroid_lon:.3f}")
    district = self._estimate_district(centroid_lat, centroid_lon)

    return {
        'id': f"osm_way_{way.get('id')}",
        'name': name,
        'fclass': green_space_type,
        'lat': centroid_lat,
        'lng': centroid_lon,
        'area_sqm': int(area_sqm),
        'district': district,
        'osm_tags': tags,
        'osm_id': way.get('id')
    }
def _get_green_space_type(self, tags: Dict) -> Optional[str]:
"""Determine if tags represent a green space and what type."""
# Check leisure tags
leisure = tags.get('leisure', '')
if leisure in ['park', 'garden', 'nature_reserve', 'recreation_ground',
'playground', 'common', 'golf_course']:
return leisure
# Check landuse tags
landuse = tags.get('landuse', '')
if landuse in ['forest', 'grass', 'meadow', 'recreation_ground',
'village_green', 'allotments']:
return landuse
# Check natural tags
natural = tags.get('natural', '')
if natural in ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood']:
return natural
return None
def _calculate_centroid(self, coordinates: List[Tuple[float, float]]) -> Tuple[float, float]:
"""Calculate centroid of polygon."""
lat_sum = sum(coord[0] for coord in coordinates)
lon_sum = sum(coord[1] for coord in coordinates)
count = len(coordinates)
return lat_sum / count, lon_sum / count
def _calculate_area(self, coordinates: List[Tuple[float, float]]) -> float:
"""Calculate area of polygon using shoelace formula."""
if len(coordinates) < 3:
return 0
# Convert to approximate meters for Berlin
lat_to_m = 111000 # meters per degree latitude
lon_to_m = 111000 * math.cos(math.radians(52.5)) # adjust for Berlin latitude
# Convert coordinates to meters
coords_m = [(lat * lat_to_m, lon * lon_to_m) for lat, lon in coordinates]
# Shoelace formula
area = 0
n = len(coords_m)
for i in range(n):
j = (i + 1) % n
area += coords_m[i][0] * coords_m[j][1]
area -= coords_m[j][0] * coords_m[i][1]
return abs(area) / 2
def _estimate_district(self, lat: float, lng: float) -> str:
"""Rough district estimation from coordinates."""
# Very rough Berlin district boundaries
if lat > 52.55:
return "Pankow" if lng < 13.45 else "Lichtenberg"
elif lat > 52.52:
if lng < 13.25:
return "Charlottenburg-Wilmersdorf"
elif lng < 13.42:
return "Mitte"
else:
return "Friedrichshain-Kreuzberg"
elif lat > 52.45:
if lng < 13.25:
return "Steglitz-Zehlendorf"
elif lng < 13.42:
return "Tempelhof-Schöneberg"
else:
return "Neukölln"
else:
return "Treptow-Köpenick"
async def enhance_green_space_with_real_data(self, space_data: Dict) -> Optional[Dict]:
"""Build the final enhanced-space record for one parsed green space.

Combines the OSM-derived geometry with live data from the project's
tree and toilet services plus several heuristic estimates.
Returns the enhanced dict, or None if any lookup fails (best-effort).
"""
try:
lat = space_data['lat']
lng = space_data['lng']
area_sqm = space_data['area_sqm']
print(f"Enhancing {space_data['name']} ({space_data['district']})...")
# Adaptive search radius: grows with sqrt(area) (i.e. the space's rough
# side length), clamped to the 100–350 m range.
radius = min(350, max(100, int((area_sqm ** 0.5) * 0.7)))
# Get real data using existing services.
# NOTE(review): the shapes of tree_response (.shade_analysis, .metrics)
# and the toilet dicts come from project services not visible here —
# the attribute/key accesses below mirror those contracts.
tree_response = await self.tree_service.get_trees_near_location(
lat, lng, radius_m=radius
)
nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 600)
# Calculate scores.
# NOTE(review): scoring assumes nearby_toilets is sorted by distance
# (index 0 = nearest) — confirm against BerlinDataService.
toilet_score = self._score_toilet_accessibility(nearby_toilets)
space_type = self._map_to_space_type(space_data.get('fclass', ''))
enhanced_space = {
"id": space_data['id'],
"name": space_data['name'],
"description": f"Berlin {space_data.get('fclass', 'green space')} from local OSM data",
"type": space_type,
"coordinates": {
"lat": float(lat),
"lng": float(lng)
},
"neighborhood": space_data.get('district', 'Unknown'),
"area_sqm": area_sqm,
# Perimeter approximated as if the area were a square (4 * side).
"perimeter_m": int(4 * (area_sqm ** 0.5)),
# Environmental features from real tree data (floored at 5% coverage).
"environmental": {
"tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
"shade_quality": tree_response.shade_analysis.shade_quality_score,
"noise_level": self._estimate_noise_level(space_data),
"wildlife_diversity_score": tree_response.metrics.species_diversity_score,
"water_features": self._detect_water_features(space_data),
"natural_surface_percent": self._estimate_natural_surface(space_data.get('fclass', ''))
},
# Real tree metrics copied through from the tree service response.
"tree_data": {
"total_trees": tree_response.metrics.total_trees,
"trees_per_hectare": tree_response.metrics.trees_per_hectare,
"species_count": len(tree_response.metrics.dominant_species),
"species_diversity_score": tree_response.metrics.species_diversity_score,
"mature_trees_count": tree_response.metrics.mature_trees_count,
"young_trees_count": tree_response.metrics.young_trees_count,
"average_tree_age": tree_response.metrics.average_tree_age,
"average_height": tree_response.metrics.average_height,
"average_crown_diameter": tree_response.metrics.average_crown_diameter,
"shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
"dominant_species": tree_response.metrics.dominant_species[:3]
},
# Real toilet accessibility within the 600 m search radius.
"toilet_accessibility": {
"nearby_toilets_count": len(nearby_toilets),
"accessibility_score": toilet_score,
"nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
"free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
"accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
},
# Standard features — heuristic defaults, not measured data.
"accessibility": {
"wheelchair_accessible": True,
"public_transport_score": self._estimate_transport_score(space_data.get('district', '')),
"cycling_infrastructure": area_sqm > 4000,
"parking_availability": 2 if area_sqm > 20000 else 1,
"lighting_quality": 3 if 'mitte' in space_data.get('district', '').lower() else 2
},
# Recreation features — size-based heuristics.
"recreation": {
"playground_quality": self._estimate_playground_quality(space_data),
"sports_facilities": self._estimate_sports_facilities(space_data),
"running_paths": area_sqm > 6000,
"cycling_paths": area_sqm > 12000,
"dog_friendly": True,
"bbq_allowed": self._allows_bbq(space_data)
},
# OSM metadata retained for traceability.
"osm_metadata": {
"osm_id": space_data.get('osm_id'),
"tags": space_data.get('osm_tags', {}),
"source": "local_osm_extract"
},
"last_updated": datetime.now().isoformat(),
"data_sources": ["local_osm_extract", "berlin_tree_cadastre", "berlin_toilets"],
# Fixed confidence constant for this data pipeline.
"confidence_score": 92
}
trees = tree_response.metrics.total_trees
toilets = len(nearby_toilets)
print(f"{space_data['name']}: {trees} trees, {toilets} toilets")
return enhanced_space
except Exception as e:
# Best-effort: a failed enhancement is reported and dropped (None).
print(f"❌ Error enhancing {space_data['name']}: {e}")
return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
if not nearby_toilets:
return 25
nearest = nearby_toilets[0]['distance_meters']
if nearest <= 200:
score = 90
elif nearest <= 400:
score = 70
else:
score = 50
# Quality bonuses
free = len([t for t in nearby_toilets if t.get('is_free', False)])
accessible = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
score += min(10, free * 5 + accessible * 3)
return min(100, score)
def _map_to_space_type(self, fclass: str) -> str:
mapping = {
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN', 'wood': 'FOREST',
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
'common': 'PARK', 'village_green': 'GRASS', 'allotments': 'GARDEN'
}
return mapping.get(fclass, 'PARK')
def _detect_water_features(self, space_data: Dict) -> bool:
name = space_data.get('name', '').lower()
tags = space_data.get('osm_tags', {})
water_keywords = ['see', 'teich', 'pond', 'lake', 'bach', 'spree', 'wasser']
return any(keyword in name for keyword in water_keywords) or 'water' in tags.values()
def _estimate_noise_level(self, space_data: Dict) -> int:
fclass = space_data.get('fclass', '')
district = space_data.get('district', '')
base = {'forest': 1, 'wood': 1, 'nature_reserve': 1, 'meadow': 2,
'park': 2, 'garden': 2, 'playground': 3}.get(fclass, 2)
if any(busy in district.lower() for busy in ['mitte', 'kreuzberg', 'friedrichshain']):
base += 1
return min(5, base)
def _estimate_natural_surface(self, fclass: str) -> int:
return {'forest': 95, 'wood': 95, 'nature_reserve': 90, 'meadow': 95,
'grass': 85, 'park': 75, 'garden': 65, 'playground': 40}.get(fclass, 70)
def _estimate_transport_score(self, district: str) -> int:
district_lower = district.lower()
if 'mitte' in district_lower:
return 5
elif any(name in district_lower for name in ['charlottenburg', 'kreuzberg', 'friedrichshain']):
return 4
else:
return 3
def _estimate_playground_quality(self, space_data: Dict) -> int:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
if fclass == 'playground':
return 80
elif 'playground' in tags.values():
return 75
elif fclass == 'park':
return 55
else:
return 30
def _estimate_sports_facilities(self, space_data: Dict) -> bool:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
name = space_data.get('name', '').lower()
return (fclass == 'recreation_ground' or
'sport' in str(tags.values()).lower() or
any(term in name for term in ['sport', 'football', 'tennis']))
def _allows_bbq(self, space_data: Dict) -> bool:
fclass = space_data.get('fclass', '')
area = space_data.get('area_sqm', 0)
tags = space_data.get('osm_tags', {})
# Check explicit BBQ tags
if tags.get('bbq') == 'yes':
return True
elif tags.get('bbq') == 'no':
return False
# Default based on type and size
return fclass in ['park', 'recreation_ground'] and area > 5000
async def process_all_green_spaces(self):
    """Full pipeline: download extract, parse, then enhance every space.

    Returns the list of enhanced space dicts (possibly empty on failure).
    """
    print("🌳 Processing Berlin green spaces from local OSM data...")

    # Step 1: get OSM data (the XML extract is more reliable than PBF).
    try:
        osm_file = self.download_simple_osm_extract()
    except Exception as e:
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and hid the actual error.
        print(f"❌ Could not download OSM data: {e}")
        return []

    # Step 2: parse green spaces out of the XML.
    green_spaces = self.parse_osm_xml(osm_file)
    if not green_spaces:
        print("❌ No green spaces found in OSM data")
        return []
    print(f"📊 Found {len(green_spaces)} green spaces to enhance")

    # Step 3: enhance each space with real tree/toilet data.
    enhanced_spaces = []
    for i, space_data in enumerate(green_spaces, 1):
        print(f"[{i}/{len(green_spaces)}]", end=" ")
        result = await self.enhance_green_space_with_real_data(space_data)
        if result:
            enhanced_spaces.append(result)
        # Every 20 spaces: progress report plus a short pause to stay
        # polite toward the underlying data services.
        if i % 20 == 0:
            print(f"\n Progress: {len(enhanced_spaces)} enhanced so far...")
            await asyncio.sleep(0.1)

    print(f"\n✅ Enhanced {len(enhanced_spaces)} spaces with real data!")
    return enhanced_spaces
def save_enhanced_data(self, enhanced_spaces: List[Dict]):
    """Write the final enhanced dataset (plus summary stats) to JSON.

    Returns the output file path.
    """
    output_file = self.processed_dir / "osm_berlin_green_spaces_enhanced.json"

    # Coverage statistics over the enhanced records.
    total = len(enhanced_spaces)
    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)

    # BUGFIX: guard against ZeroDivisionError when the result set is empty.
    def pct(part: int) -> str:
        return f"{round((part / total) * 100, 1)}%" if total else "0%"

    data = {
        "green_spaces": enhanced_spaces,
        "total_count": total,
        "last_updated": datetime.now().isoformat(),
        "data_sources": [
            "local_osm_extract_processed_offline",
            "berlin_tree_cadastre",
            "berlin_toilets"
        ],
        "processing_info": {
            "method": "local_osm_processing_no_api_dependency",
            "includes_all_osm_green_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": total,
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            "tree_coverage": pct(with_trees),
            "toilet_coverage": pct(with_toilets)
        }
    }

    # ensure_ascii=False keeps German names (umlauts) readable in the file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n🎉 Saved comprehensive dataset: {output_file}")
    print(f"📊 {total} total green spaces")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print(f"\n✨ Ready to replace mock data in your API!")
    return output_file
async def main():
    """Script entry point: run the full pipeline and save the result."""
    processor = LocalOSMProcessor()
    banner = "=" * 50
    try:
        print("🚀 Berlin Green Spaces: Local OSM Processing")
        print(banner)
        print("• Downloads OSM data once (no API dependency)")
        print("• Processes locally for all green spaces")
        print("• Enhances with real Berlin tree + toilet data")
        print(banner)

        enhanced_spaces = await processor.process_all_green_spaces()
        if enhanced_spaces:
            processor.save_enhanced_data(enhanced_spaces)
    except KeyboardInterrupt:
        print("\n⚠️ Interrupted")
    except Exception as e:
        print(f"❌ Error: {e}")


if __name__ == "__main__":
    asyncio.run(main())