#!/usr/bin/env python3
"""
Enhanced Berlin green space processor using existing tree and toilet services.
Downloads OSM green space boundaries and enhances them with real data using existing services.
"""
import os
import json
import zipfile
import requests
import asyncio
from pathlib import Path
import geopandas as gpd
import pandas as pd
from datetime import datetime
from typing import List, Dict, Optional
import sys
# Add the app directory to Python path to import services
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
class RealDataGreenSpaceProcessor:
    """Enhances OSM green-space boundaries with real tree and toilet data."""

    def __init__(self, data_dir: str = "app/data"):
        """Set up the data directory layout and the backing services.

        Args:
            data_dir: Root directory holding raw and processed data files.
        """
        base = Path(data_dir)
        self.data_dir = base
        self.raw_dir = base / "geo-raw"
        self.processed_dir = base / "processed"

        # Make sure the on-disk layout exists before any download step runs.
        for directory in (self.raw_dir, self.processed_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Reuse the project's existing services for tree and toilet lookups.
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()
def download_berlin_districts(self):
"""Download Berlin district boundaries."""
json_file = self.raw_dir / "bezirksgrenzen.geojson"
if json_file.exists():
print(f"Berlin district data already exists: {json_file}")
return json_file
link = "https://tsb-opendata.s3.eu-central-1.amazonaws.com/bezirksgrenzen/bezirksgrenzen.geojson"
print(f"Downloading Berlin district data from {link}")
try:
response = requests.get(link, timeout=30)
response.raise_for_status()
with open(json_file, 'wb') as f:
f.write(response.content)
print(f"Downloaded to {json_file}")
return json_file
except Exception as e:
print(f"Error downloading districts: {e}")
raise
def download_osm_data(self):
"""Download Berlin OpenStreetMap data."""
zip_file = self.raw_dir / "berlin_shapes.zip"
shp_dir = self.raw_dir / "berlin_shapes"
# Check if already extracted
required_files = ["gis_osm_landuse_a_free_1.shp", "gis_osm_natural_a_free_1.shp", "gis_osm_leisure_a_free_1.shp"]
if all((shp_dir / f).exists() for f in required_files):
print(f"Berlin OSM data already exists: {shp_dir}")
return shp_dir
if not zip_file.exists():
link = "https://download.geofabrik.de/europe/germany/berlin-latest-free.shp.zip"
print(f"Downloading Berlin OSM data from {link} (this may take several minutes...)")
try:
response = requests.get(link, stream=True, timeout=300) # 5 min timeout
response.raise_for_status()
with open(zip_file, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"Download completed: {zip_file}")
except Exception as e:
print(f"Error downloading OSM data: {e}")
raise
print(f"Extracting Berlin OSM data to {shp_dir}")
try:
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(shp_dir)
print(f"Extracted to {shp_dir}")
except Exception as e:
print(f"Error extracting OSM data: {e}")
raise
return shp_dir
def load_osm_green_spaces(self):
"""Load OSM green space polygons."""
print("Loading OSM green space boundaries...")
# Download required data
districts_file = self.download_berlin_districts()
shp_dir = self.download_osm_data()
# Load Berlin districts for clipping
districts = gpd.read_file(districts_file)
# Define green space categories we want
green_categories = {
'landuse': ['forest', 'grass', 'meadow', 'recreation_ground', 'village_green', 'allotments'],
'natural': ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood'],
'leisure': ['park', 'garden', 'nature_reserve', 'playground', 'pitch', 'common', 'golf_course']
}
all_green_spaces = []
# Process each category
for category, subcategories in green_categories.items():
shapefile = shp_dir / f"gis_osm_{category}_a_free_1.shp"
if not shapefile.exists():
print(f"Warning: {shapefile} not found, skipping")
continue
print(f"Processing {category} data...")
try:
gdf = gpd.read_file(shapefile)
# Filter to relevant subcategories
gdf_filtered = gdf[gdf['fclass'].isin(subcategories)].copy()
if len(gdf_filtered) == 0:
print(f"No {category} features found in subcategories")
continue
# Clip to Berlin boundaries
gdf_clipped = gpd.clip(gdf_filtered, districts)
# Calculate area and filter out very small areas (< 1000 sqm)
gdf_clipped['area_sqm'] = gdf_clipped.geometry.area
gdf_clipped = gdf_clipped[gdf_clipped['area_sqm'] >= 1000]
if len(gdf_clipped) > 0:
all_green_spaces.append(gdf_clipped)
print(f"Found {len(gdf_clipped)} {category} features")
except Exception as e:
print(f"Error processing {category}: {e}")
continue
if not all_green_spaces:
raise ValueError("No green space data found")
# Combine all green spaces
green_spaces = gpd.GeoDataFrame(pd.concat(all_green_spaces, ignore_index=True))
# Add district information
green_spaces = gpd.sjoin(green_spaces, districts[['Bezirk', 'geometry']], how='left')
# Calculate centroids for analysis
green_spaces['centroid'] = green_spaces.geometry.centroid
green_spaces['centroid_lat'] = green_spaces.centroid.y
green_spaces['centroid_lng'] = green_spaces.centroid.x
print(f"Total green spaces found: {len(green_spaces)}")
return green_spaces
    async def enhance_green_space_with_real_data(self, row):
        """Enhance a single green space with real tree and toilet data.

        Args:
            row: GeoDataFrame row carrying 'centroid_lat', 'centroid_lng',
                'area_sqm' plus (optionally) 'fclass', 'name', 'Bezirk'
                and a geometry.

        Returns:
            Dict describing the enhanced green space, or None if any lookup
            failed for this row (the error is printed, not re-raised).
        """
        try:
            lat = row['centroid_lat']
            lng = row['centroid_lng']
            area_sqm = int(row['area_sqm'])
            # Use existing tree service to get real tree data. The search
            # radius scales with the space's size (sqrt(area) approximates
            # the side length), capped at 400 m.
            tree_response = await self.tree_service.get_trees_near_location(
                lat, lng, radius_m=min(400, int((area_sqm ** 0.5) * 1.5))  # Adaptive radius
            )
            # Use existing toilet service to get real toilet data (800 m radius)
            nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 800)
            # Calculate toilet accessibility score
            toilet_score = self._score_toilet_accessibility(nearby_toilets)
            # Map OSM type to our enum
            space_type = self._map_osm_to_space_type(row.get('fclass', ''))
            # Generate ID (row.name is the pandas index label of this row)
            space_id = f"real_{row.get('fclass', 'unknown')}_{row.name}"
            # Create enhanced green space using real data
            enhanced_space = {
                "id": space_id,
                "name": row.get('name') or f"{row.get('fclass', 'Green Space').title()} in {row.get('Bezirk', 'Berlin')}",
                "description": f"Real Berlin {row.get('fclass', 'green space')} enhanced with tree and toilet data",
                "type": space_type,
                "coordinates": {
                    "lat": float(lat),
                    "lng": float(lng)
                },
                "neighborhood": row.get('Bezirk', 'Unknown'),
                "area_sqm": area_sqm,
                # NOTE(review): geometry.length is evaluated in the geometry's
                # native CRS — if that is WGS84 this is degrees, not metres;
                # confirm the upstream projection.
                "perimeter_m": int(row.geometry.length) if hasattr(row.geometry, 'length') else 0,
                # Environmental features using real tree data
                "environmental": {
                    # Floor of 5% so sparse tree data never reports 0 coverage
                    "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                    "shade_quality": tree_response.shade_analysis.shade_quality_score,
                    "noise_level": self._estimate_noise_level(row.get('fclass', ''), row.get('Bezirk', '')),
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    # Heuristic: inferred from the feature class / name only
                    "water_features": 'water' in str(row.get('fclass', '')).lower() or 'river' in str(row.get('name', '')).lower(),
                    "natural_surface_percent": self._estimate_natural_surface(row.get('fclass', ''))
                },
                # Real tree metrics from existing service
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species
                },
                # Real toilet accessibility from existing service
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": toilet_score,
                    # assumes the service returns toilets sorted nearest-first
                    # — TODO confirm against BerlinDataService
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },
                # Standard accessibility features (static heuristics, not
                # measured data)
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": 3,  # Could be enhanced with real transit data
                    "cycling_infrastructure": area_sqm > 5000,
                    "parking_availability": 2,
                    "lighting_quality": 2
                },
                # Recreation features based on OSM data and size thresholds
                "recreation": {
                    "playground_quality": self._estimate_playground_quality(row.get('fclass', ''), tree_response.metrics.total_trees),
                    "sports_facilities": 'pitch' in str(row.get('fclass', '')).lower() or 'sport' in str(row.get('name', '')).lower(),
                    "running_paths": area_sqm > 8000,
                    "cycling_paths": area_sqm > 15000,
                    "dog_friendly": True,
                    "bbq_allowed": row.get('fclass') in ['park', 'recreation_ground'] and area_sqm > 5000
                },
                "last_updated": datetime.now().isoformat(),
                "data_sources": ["openstreetmap", "berlin_tree_cadastre", "berlin_toilets"],
                "confidence_score": 95
            }
            return enhanced_space
        except Exception as e:
            # Best-effort: log and skip this row rather than abort the batch.
            print(f"Error enhancing green space {row.name}: {e}")
            return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
"""Score toilet accessibility using existing toilet data."""
if not nearby_toilets:
return 20
nearest_distance = nearby_toilets[0]['distance_meters']
# Distance-based scoring
if nearest_distance <= 200:
score = 100
elif nearest_distance <= 400:
score = 80
elif nearest_distance <= 600:
score = 60
else:
score = 40
# Bonuses for quality
free_toilets = len([t for t in nearby_toilets if t.get('is_free', False)])
accessible_toilets = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
score += min(20, free_toilets * 5 + accessible_toilets * 3)
return min(100, score)
def _map_osm_to_space_type(self, fclass: str) -> str:
"""Map OSM feature class to green space types."""
mapping = {
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN',
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
'wood': 'FOREST', 'heath': 'HEATH', 'pitch': 'SPORTS_AREA',
'golf_course': 'SPORTS_AREA', 'common': 'PARK', 'village_green': 'GRASS',
'allotments': 'GARDEN'
}
return mapping.get(fclass, 'PARK')
def _estimate_noise_level(self, fclass: str, district: str) -> int:
"""Estimate noise level (1=very quiet, 5=very noisy)."""
base_noise = {
'forest': 1, 'nature_reserve': 1, 'wood': 1,
'meadow': 2, 'grass': 2, 'heath': 2,
'park': 2, 'garden': 2, 'common': 2,
'recreation_ground': 3, 'playground': 3, 'pitch': 3,
'golf_course': 2, 'allotments': 2
}
# Central districts are noisier
central_districts = ['Mitte', 'Kreuzberg', 'Friedrichshain']
district_modifier = 1 if district in central_districts else 0
return min(5, base_noise.get(fclass, 2) + district_modifier)
def _estimate_natural_surface(self, fclass: str) -> int:
"""Estimate percentage of natural surface."""
surface_map = {
'forest': 95, 'nature_reserve': 95, 'wood': 95,
'meadow': 95, 'grass': 90, 'heath': 90,
'park': 75, 'garden': 65, 'common': 80,
'recreation_ground': 60, 'playground': 40, 'pitch': 20,
'golf_course': 70, 'allotments': 85
}
return surface_map.get(fclass, 70)
def _estimate_playground_quality(self, fclass: str, tree_count: int) -> int:
"""Estimate playground quality score."""
base_scores = {
'playground': 85,
'park': 65,
'recreation_ground': 70,
'garden': 40,
'common': 50
}
base = base_scores.get(fclass, 25)
# Trees improve playground appeal for families
tree_bonus = min(15, tree_count // 5) # +3 per 5 trees, max 15
return min(100, base + tree_bonus)
async def process_all_green_spaces(self):
"""Process all green spaces with real data enhancement."""
print("Starting enhanced green space processing with real data...")
# Load OSM green space boundaries
osm_green_spaces = self.load_osm_green_spaces()
enhanced_green_spaces = []
print(f"Enhancing {len(osm_green_spaces)} green spaces with real tree and toilet data...")
# Process in batches to avoid overwhelming the system
batch_size = 50
total_processed = 0
for i in range(0, len(osm_green_spaces), batch_size):
batch = osm_green_spaces.iloc[i:i+batch_size]
batch_results = []
for idx, row in batch.iterrows():
result = await self.enhance_green_space_with_real_data(row)
if result:
batch_results.append(result)
total_processed += 1
if total_processed % 25 == 0:
print(f"Processed {total_processed}/{len(osm_green_spaces)} green spaces...")
enhanced_green_spaces.extend(batch_results)
# Small delay between batches
await asyncio.sleep(0.1)
print(f"Successfully enhanced {len(enhanced_green_spaces)} green spaces with real data")
return enhanced_green_spaces
def save_enhanced_data(self, enhanced_green_spaces: List[Dict]):
"""Save enhanced green spaces to JSON file."""
output_file = self.processed_dir / "real_berlin_green_spaces.json"
# Calculate summary statistics
spaces_with_trees = len([gs for gs in enhanced_green_spaces if gs["tree_data"]["total_trees"] > 0])
spaces_with_toilets = len([gs for gs in enhanced_green_spaces if gs["toilet_accessibility"]["nearby_toilets_count"] > 0])
total_trees = sum(gs["tree_data"]["total_trees"] for gs in enhanced_green_spaces)
avg_species_per_space = sum(gs["tree_data"]["species_count"] for gs in enhanced_green_spaces) / len(enhanced_green_spaces) if enhanced_green_spaces else 0
data = {
"green_spaces": enhanced_green_spaces,
"total_count": len(enhanced_green_spaces),
"last_updated": datetime.now().isoformat(),
"data_sources": [
"openstreetmap_boundaries",
"berlin_tree_cadastre_via_service",
"berlin_toilet_locations_via_service",
"berlin_districts"
],
"processing_info": {
"script_version": "1.0",
"coordinate_system": "WGS84",
"uses_existing_services": True,
"tree_analysis_via": "StreetTreeService",
"toilet_analysis_via": "BerlinDataService"
},
"summary_stats": {
"spaces_with_trees": spaces_with_trees,
"spaces_with_nearby_toilets": spaces_with_toilets,
"total_trees_in_all_spaces": total_trees,
"average_species_per_space": round(avg_species_per_space, 1),
"coverage_percentage": {
"with_tree_data": round((spaces_with_trees / len(enhanced_green_spaces)) * 100, 1) if enhanced_green_spaces else 0,
"with_toilet_data": round((spaces_with_toilets / len(enhanced_green_spaces)) * 100, 1) if enhanced_green_spaces else 0
}
}
}
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
print(f"✅ Saved {len(enhanced_green_spaces)} enhanced green spaces to {output_file}")
print(f"📊 Summary:")
print(f" - {spaces_with_trees} spaces have tree data ({round((spaces_with_trees/len(enhanced_green_spaces))*100, 1)}%)")
print(f" - {spaces_with_toilets} spaces have nearby toilets ({round((spaces_with_toilets/len(enhanced_green_spaces))*100, 1)}%)")
print(f" - {total_trees} total trees analyzed")
print(f" - {avg_species_per_space:.1f} average species per space")
return output_file
async def main():
    """Run the full enhancement pipeline and report the output location."""
    processor = RealDataGreenSpaceProcessor()
    try:
        # Enhance all green spaces via the existing services, then persist.
        enhanced = await processor.process_all_green_spaces()
        output_file = processor.save_enhanced_data(enhanced)
        print(f"\n🎉 Successfully created real data enhanced Berlin green spaces!")
        print(f"📁 Output: {output_file}")
    except KeyboardInterrupt:
        print("\n⚠️ Processing interrupted by user")
    except Exception as e:
        print(f"❌ Error processing data: {e}")
        raise
if __name__ == "__main__":
    # Script entry point: drive the async pipeline to completion.
    asyncio.run(main())