#!/usr/bin/env python3
"""
Enhanced Berlin green space processor using existing tree and toilet services.
Downloads OSM green space boundaries and enhances them with real data using existing services.
"""

import os
import json
import zipfile
import requests
import asyncio
from pathlib import Path
import geopandas as gpd
import pandas as pd
from datetime import datetime
from typing import List, Dict, Optional
import sys

# Add the app directory to Python path to import services
sys.path.append(str(Path(__file__).parent.parent))

from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService

class RealDataGreenSpaceProcessor:
    """Build an enhanced Berlin green space dataset from OSM and city data.

    Downloads Berlin district boundaries and Geofabrik OSM shapefiles,
    extracts green space polygons, and enriches each one with real tree
    and toilet data via the existing StreetTreeService and BerlinDataService.
    """

    # ETRS89 / UTM zone 33N: a metric CRS covering Berlin. All area and
    # length calculations are done in this projection — calling .area/.length
    # on WGS84 (lat/lng) geometries yields degrees, not metres.
    _METRIC_CRS = "EPSG:25833"

    def __init__(self, data_dir: str = "app/data"):
        """Set up data directories and the backing data services.

        Args:
            data_dir: Base data directory; raw downloads go to
                ``<data_dir>/geo-raw`` and output to ``<data_dir>/processed``.
        """
        self.data_dir = Path(data_dir)
        self.raw_dir = self.data_dir / "geo-raw"
        self.processed_dir = self.data_dir / "processed"

        # Create directories
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)

        # Initialize existing services
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()

    def download_berlin_districts(self) -> Path:
        """Download Berlin district boundaries (GeoJSON), cached on disk.

        Returns:
            Path to the downloaded (or previously cached) GeoJSON file.

        Raises:
            requests.RequestException: if the download fails.
        """
        json_file = self.raw_dir / "bezirksgrenzen.geojson"

        if json_file.exists():
            print(f"Berlin district data already exists: {json_file}")
            return json_file

        link = "https://tsb-opendata.s3.eu-central-1.amazonaws.com/bezirksgrenzen/bezirksgrenzen.geojson"
        print(f"Downloading Berlin district data from {link}")

        try:
            response = requests.get(link, timeout=30)
            response.raise_for_status()

            with open(json_file, 'wb') as f:
                f.write(response.content)

            print(f"Downloaded to {json_file}")
            return json_file
        except Exception as e:
            print(f"Error downloading districts: {e}")
            raise

    def download_osm_data(self) -> Path:
        """Download and extract the Geofabrik Berlin OSM shapefile bundle.

        Both the zip download and the extracted directory are cached; the
        download is skipped when all required shapefiles already exist.

        Returns:
            Path to the directory containing the extracted shapefiles.

        Raises:
            requests.RequestException: if the download fails.
            zipfile.BadZipFile: if the archive is corrupt.
        """
        zip_file = self.raw_dir / "berlin_shapes.zip"
        shp_dir = self.raw_dir / "berlin_shapes"

        # Check if already extracted
        required_files = ["gis_osm_landuse_a_free_1.shp", "gis_osm_natural_a_free_1.shp", "gis_osm_leisure_a_free_1.shp"]
        if all((shp_dir / f).exists() for f in required_files):
            print(f"Berlin OSM data already exists: {shp_dir}")
            return shp_dir

        if not zip_file.exists():
            link = "https://download.geofabrik.de/europe/germany/berlin-latest-free.shp.zip"
            print(f"Downloading Berlin OSM data from {link} (this may take several minutes...)")

            try:
                # Stream to disk in chunks so the archive never has to fit in memory.
                response = requests.get(link, stream=True, timeout=300)  # 5 min timeout
                response.raise_for_status()

                with open(zip_file, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

                print(f"Download completed: {zip_file}")
            except Exception as e:
                print(f"Error downloading OSM data: {e}")
                raise

        print(f"Extracting Berlin OSM data to {shp_dir}")
        try:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(shp_dir)
            print(f"Extracted to {shp_dir}")
        except Exception as e:
            print(f"Error extracting OSM data: {e}")
            raise

        return shp_dir

    def load_osm_green_spaces(self):
        """Load OSM green space polygons clipped to Berlin.

        Returns:
            GeoDataFrame in WGS84 with ``area_sqm``, ``perimeter_m``,
            district (``Bezirk``) and centroid lat/lng columns added.

        Raises:
            ValueError: if no green space features could be loaded.
        """
        print("Loading OSM green space boundaries...")

        # Download required data
        districts_file = self.download_berlin_districts()
        shp_dir = self.download_osm_data()

        # Load Berlin districts for clipping
        districts = gpd.read_file(districts_file)

        # Define green space categories we want
        green_categories = {
            'landuse': ['forest', 'grass', 'meadow', 'recreation_ground', 'village_green', 'allotments'],
            'natural': ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood'],
            'leisure': ['park', 'garden', 'nature_reserve', 'playground', 'pitch', 'common', 'golf_course']
        }

        all_green_spaces = []

        # Process each category
        for category, subcategories in green_categories.items():
            shapefile = shp_dir / f"gis_osm_{category}_a_free_1.shp"

            if not shapefile.exists():
                print(f"Warning: {shapefile} not found, skipping")
                continue

            print(f"Processing {category} data...")
            try:
                gdf = gpd.read_file(shapefile)

                # Geofabrik shapefiles ship in WGS84; make that explicit if
                # the .prj file was missing so reprojection below works.
                if gdf.crs is None:
                    gdf = gdf.set_crs("EPSG:4326")

                # Filter to relevant subcategories
                gdf_filtered = gdf[gdf['fclass'].isin(subcategories)].copy()

                if len(gdf_filtered) == 0:
                    print(f"No {category} features found in subcategories")
                    continue

                # Clip to Berlin boundaries
                gdf_clipped = gpd.clip(gdf_filtered, districts)

                # Calculate area in a metric CRS and filter out very small
                # areas (< 1000 sqm). Calling .area directly on lat/lng
                # geometries would yield square degrees — every Berlin park
                # is far below 1000 of those, so the filter would drop all.
                gdf_clipped['area_sqm'] = gdf_clipped.geometry.to_crs(self._METRIC_CRS).area
                gdf_clipped = gdf_clipped[gdf_clipped['area_sqm'] >= 1000]

                if len(gdf_clipped) > 0:
                    all_green_spaces.append(gdf_clipped)
                    print(f"Found {len(gdf_clipped)} {category} features")

            except Exception as e:
                print(f"Error processing {category}: {e}")
                continue

        if not all_green_spaces:
            raise ValueError("No green space data found")

        # Combine all green spaces
        green_spaces = gpd.GeoDataFrame(pd.concat(all_green_spaces, ignore_index=True))

        # Add district information. A polygon touching two districts would be
        # duplicated by the left join, so keep only the first match per row.
        green_spaces = gpd.sjoin(green_spaces, districts[['Bezirk', 'geometry']], how='left')
        green_spaces = green_spaces[~green_spaces.index.duplicated(keep='first')]

        # Perimeter in metres (again via the metric CRS) for later output.
        metric_geoms = green_spaces.geometry.to_crs(self._METRIC_CRS)
        green_spaces['perimeter_m'] = metric_geoms.length

        # Centroids computed in the metric CRS (planar centroid is only
        # meaningful there), then converted back to WGS84 for lat/lng.
        green_spaces['centroid'] = metric_geoms.centroid.to_crs(green_spaces.crs)
        green_spaces['centroid_lat'] = green_spaces['centroid'].y
        green_spaces['centroid_lng'] = green_spaces['centroid'].x

        print(f"Total green spaces found: {len(green_spaces)}")
        return green_spaces

    @staticmethod
    def _clean_text(value, default: str) -> str:
        """Return *value* if it is a non-empty string, else *default*.

        Shapefile/GeoJSON attributes come back as float NaN when missing,
        and NaN is truthy — so ``value or default`` would NOT fall back.
        """
        return value if isinstance(value, str) and value else default

    async def enhance_green_space_with_real_data(self, row) -> Optional[Dict]:
        """Enhance a single green space row with real tree and toilet data.

        Args:
            row: One row (pandas Series) from ``load_osm_green_spaces()``.

        Returns:
            Dict describing the enhanced green space, or None on failure
            (errors are logged, never raised, so one bad row cannot abort
            the whole batch).
        """
        try:
            lat = row['centroid_lat']
            lng = row['centroid_lng']
            area_sqm = int(row['area_sqm'])

            # Use existing tree service to get real tree data; search radius
            # adapts to the space's size (sqrt(area) * 1.5, capped at 400 m).
            tree_response = await self.tree_service.get_trees_near_location(
                lat, lng, radius_m=min(400, int((area_sqm ** 0.5) * 1.5))
            )

            # Use existing toilet service to get real toilet data
            nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 800)

            # Calculate toilet accessibility score
            toilet_score = self._score_toilet_accessibility(nearby_toilets)

            # Map OSM type to our enum
            space_type = self._map_osm_to_space_type(row.get('fclass', ''))

            # Generate ID
            space_id = f"real_{row.get('fclass', 'unknown')}_{row.name}"

            # Nearest toilet distance without assuming the service returns a
            # distance-sorted list.
            nearest_toilet_m = min(t['distance_meters'] for t in nearby_toilets) if nearby_toilets else None

            # NaN-safe name/district handling (missing OSM names are NaN).
            district = self._clean_text(row.get('Bezirk'), 'Unknown')
            name = self._clean_text(
                row.get('name'),
                f"{row.get('fclass', 'Green Space').title()} in {self._clean_text(row.get('Bezirk'), 'Berlin')}"
            )

            # Create enhanced green space using real data
            enhanced_space = {
                "id": space_id,
                "name": name,
                "description": f"Real Berlin {row.get('fclass', 'green space')} enhanced with tree and toilet data",
                "type": space_type,
                "coordinates": {
                    "lat": float(lat),
                    "lng": float(lng)
                },
                "neighborhood": district,
                "area_sqm": area_sqm,
                # Perimeter was computed upstream in a metric CRS; the raw
                # WGS84 geometry length would be in degrees, not metres.
                "perimeter_m": int(row['perimeter_m']) if 'perimeter_m' in row else 0,

                # Environmental features using real tree data
                "environmental": {
                    "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                    "shade_quality": tree_response.shade_analysis.shade_quality_score,
                    "noise_level": self._estimate_noise_level(row.get('fclass', ''), self._clean_text(row.get('Bezirk'), '')),
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    "water_features": 'water' in str(row.get('fclass', '')).lower() or 'river' in str(row.get('name', '')).lower(),
                    "natural_surface_percent": self._estimate_natural_surface(row.get('fclass', ''))
                },

                # Real tree metrics from existing service
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species
                },

                # Real toilet accessibility from existing service
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": toilet_score,
                    "nearest_distance_m": nearest_toilet_m,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },

                # Standard accessibility features (static estimates for now)
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": 3,  # Could be enhanced with real transit data
                    "cycling_infrastructure": area_sqm > 5000,
                    "parking_availability": 2,
                    "lighting_quality": 2
                },

                # Recreation features based on OSM data and size
                "recreation": {
                    "playground_quality": self._estimate_playground_quality(row.get('fclass', ''), tree_response.metrics.total_trees),
                    "sports_facilities": 'pitch' in str(row.get('fclass', '')).lower() or 'sport' in str(row.get('name', '')).lower(),
                    "running_paths": area_sqm > 8000,
                    "cycling_paths": area_sqm > 15000,
                    "dog_friendly": True,
                    "bbq_allowed": row.get('fclass') in ['park', 'recreation_ground'] and area_sqm > 5000
                },

                "last_updated": datetime.now().isoformat(),
                "data_sources": ["openstreetmap", "berlin_tree_cadastre", "berlin_toilets"],
                "confidence_score": 95
            }

            return enhanced_space

        except Exception as e:
            print(f"Error enhancing green space {row.name}: {e}")
            return None

    def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
        """Score toilet accessibility (20-100) from nearby toilet records.

        Distance to the nearest toilet sets the base score; free and
        wheelchair-accessible toilets add a capped bonus.
        """
        if not nearby_toilets:
            return 20

        # Do not rely on the service pre-sorting results by distance.
        nearest_distance = min(t['distance_meters'] for t in nearby_toilets)

        # Distance-based scoring
        if nearest_distance <= 200:
            score = 100
        elif nearest_distance <= 400:
            score = 80
        elif nearest_distance <= 600:
            score = 60
        else:
            score = 40

        # Bonuses for quality
        free_toilets = len([t for t in nearby_toilets if t.get('is_free', False)])
        accessible_toilets = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])

        score += min(20, free_toilets * 5 + accessible_toilets * 3)

        return min(100, score)

    def _map_osm_to_space_type(self, fclass: str) -> str:
        """Map an OSM feature class to our green space type enum value."""
        mapping = {
            'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN',
            'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
            'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
            'wood': 'FOREST', 'heath': 'HEATH', 'pitch': 'SPORTS_AREA',
            'golf_course': 'SPORTS_AREA', 'common': 'PARK', 'village_green': 'GRASS',
            'allotments': 'GARDEN'
        }
        return mapping.get(fclass, 'PARK')

    def _estimate_noise_level(self, fclass: str, district: str) -> int:
        """Estimate noise level (1=very quiet, 5=very noisy)."""
        base_noise = {
            'forest': 1, 'nature_reserve': 1, 'wood': 1,
            'meadow': 2, 'grass': 2, 'heath': 2,
            'park': 2, 'garden': 2, 'common': 2,
            'recreation_ground': 3, 'playground': 3, 'pitch': 3,
            'golf_course': 2, 'allotments': 2
        }

        # Central districts are noisier
        central_districts = ['Mitte', 'Kreuzberg', 'Friedrichshain']
        district_modifier = 1 if district in central_districts else 0

        return min(5, base_noise.get(fclass, 2) + district_modifier)

    def _estimate_natural_surface(self, fclass: str) -> int:
        """Estimate percentage of natural (unsealed) surface by OSM class."""
        surface_map = {
            'forest': 95, 'nature_reserve': 95, 'wood': 95,
            'meadow': 95, 'grass': 90, 'heath': 90,
            'park': 75, 'garden': 65, 'common': 80,
            'recreation_ground': 60, 'playground': 40, 'pitch': 20,
            'golf_course': 70, 'allotments': 85
        }
        return surface_map.get(fclass, 70)

    def _estimate_playground_quality(self, fclass: str, tree_count: int) -> int:
        """Estimate a 0-100 playground quality score.

        Starts from a per-class base score; nearby trees add a capped bonus
        since shade makes playgrounds more appealing for families.
        """
        base_scores = {
            'playground': 85,
            'park': 65,
            'recreation_ground': 70,
            'garden': 40,
            'common': 50
        }

        base = base_scores.get(fclass, 25)

        # Trees improve playground appeal for families
        tree_bonus = min(15, tree_count // 5)  # +1 per 5 trees, max 15

        return min(100, base + tree_bonus)

    async def process_all_green_spaces(self) -> List[Dict]:
        """Process all green spaces with real data enhancement.

        Returns:
            List of enhanced green space dicts (rows that failed to enhance
            are silently dropped; see enhance_green_space_with_real_data).
        """
        print("Starting enhanced green space processing with real data...")

        # Load OSM green space boundaries
        osm_green_spaces = self.load_osm_green_spaces()

        enhanced_green_spaces = []

        print(f"Enhancing {len(osm_green_spaces)} green spaces with real tree and toilet data...")

        # Process in batches to avoid overwhelming the system
        batch_size = 50
        total_processed = 0

        for i in range(0, len(osm_green_spaces), batch_size):
            batch = osm_green_spaces.iloc[i:i+batch_size]
            batch_results = []

            for idx, row in batch.iterrows():
                result = await self.enhance_green_space_with_real_data(row)
                if result:
                    batch_results.append(result)

                total_processed += 1
                if total_processed % 25 == 0:
                    print(f"Processed {total_processed}/{len(osm_green_spaces)} green spaces...")

            enhanced_green_spaces.extend(batch_results)

            # Small delay between batches
            await asyncio.sleep(0.1)

        print(f"Successfully enhanced {len(enhanced_green_spaces)} green spaces with real data")
        return enhanced_green_spaces

    def save_enhanced_data(self, enhanced_green_spaces: List[Dict]) -> Path:
        """Save enhanced green spaces plus summary statistics to JSON.

        Args:
            enhanced_green_spaces: Output of ``process_all_green_spaces()``.

        Returns:
            Path to the written JSON file.
        """
        output_file = self.processed_dir / "real_berlin_green_spaces.json"

        # Calculate summary statistics. Guard every ratio against an empty
        # input list so a failed run still produces a valid (empty) file
        # instead of a ZeroDivisionError.
        total_count = len(enhanced_green_spaces)
        spaces_with_trees = len([gs for gs in enhanced_green_spaces if gs["tree_data"]["total_trees"] > 0])
        spaces_with_toilets = len([gs for gs in enhanced_green_spaces if gs["toilet_accessibility"]["nearby_toilets_count"] > 0])
        total_trees = sum(gs["tree_data"]["total_trees"] for gs in enhanced_green_spaces)
        avg_species_per_space = sum(gs["tree_data"]["species_count"] for gs in enhanced_green_spaces) / total_count if total_count else 0
        tree_pct = round((spaces_with_trees / total_count) * 100, 1) if total_count else 0
        toilet_pct = round((spaces_with_toilets / total_count) * 100, 1) if total_count else 0

        data = {
            "green_spaces": enhanced_green_spaces,
            "total_count": total_count,
            "last_updated": datetime.now().isoformat(),
            "data_sources": [
                "openstreetmap_boundaries",
                "berlin_tree_cadastre_via_service",
                "berlin_toilet_locations_via_service",
                "berlin_districts"
            ],
            "processing_info": {
                "script_version": "1.0",
                "coordinate_system": "WGS84",
                "uses_existing_services": True,
                "tree_analysis_via": "StreetTreeService",
                "toilet_analysis_via": "BerlinDataService"
            },
            "summary_stats": {
                "spaces_with_trees": spaces_with_trees,
                "spaces_with_nearby_toilets": spaces_with_toilets,
                "total_trees_in_all_spaces": total_trees,
                "average_species_per_space": round(avg_species_per_space, 1),
                "coverage_percentage": {
                    "with_tree_data": tree_pct,
                    "with_toilet_data": toilet_pct
                }
            }
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"✅ Saved {total_count} enhanced green spaces to {output_file}")
        print(f"📊 Summary:")
        print(f" - {spaces_with_trees} spaces have tree data ({tree_pct}%)")
        print(f" - {spaces_with_toilets} spaces have nearby toilets ({toilet_pct}%)")
        print(f" - {total_trees} total trees analyzed")
        print(f" - {avg_species_per_space:.1f} average species per space")

        return output_file
async def main():
    """Main processing function."""
    processor = RealDataGreenSpaceProcessor()

    try:
        # Run the full enhancement pipeline, then persist the result set.
        spaces = await processor.process_all_green_spaces()
        output_file = processor.save_enhanced_data(spaces)

        print("\n🎉 Successfully created real data enhanced Berlin green spaces!")
        print(f"📁 Output: {output_file}")
    except KeyboardInterrupt:
        print("\n⚠️ Processing interrupted by user")
    except Exception as e:
        # Report and re-raise so the process exits non-zero on failure.
        print(f"❌ Error processing data: {e}")
        raise
# Script entry point: run the async pipeline to completion.
if __name__ == "__main__":
    asyncio.run(main())