#!/usr/bin/env python3
"""
Quick Berlin green spaces processor.

Pre-filters OSM data efficiently, then processes only the best candidates.
"""

import json
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
import sys
import re
import math
# from tqdm.asyncio import tqdm  # Not available; tqdm dependency removed
from xml.etree.ElementTree import iterparse

# Add the app directory to the Python path
sys.path.append(str(Path(__file__).parent.parent))

from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService


def calculate_polygon_area_sqm(coords):
    """Calculate the area of a polygon in square meters using the shoelace formula."""
    if len(coords) < 3:
        return 5000  # Default for invalid polygons

    # Helper: degrees to radians
    def to_radians(deg):
        return deg * math.pi / 180

    # Use a simple planar approximation for small areas:
    # convert lat/lng to approximate meters (rough, but adequate for the Berlin area)
    lat_center = sum(lat for lat, lng in coords) / len(coords)
    lng_center = sum(lng for lat, lng in coords) / len(coords)

    # Approximate meters per degree at Berlin's latitude
    meters_per_lat = 111320  # roughly constant
    meters_per_lng = 111320 * math.cos(to_radians(lat_center))

    # Convert coordinates to meters relative to the center
    meter_coords = []
    for lat, lng in coords:
        x = (lng - lng_center) * meters_per_lng
        y = (lat - lat_center) * meters_per_lat
        meter_coords.append((x, y))

    # Shoelace formula
    area = 0
    n = len(meter_coords)
    for i in range(n):
        j = (i + 1) % n
        area += meter_coords[i][0] * meter_coords[j][1]
        area -= meter_coords[j][0] * meter_coords[i][1]

    area = abs(area) / 2

    # Reasonable bounds check
    if area < 100:  # Too small to be plausible
        return 5000
    elif area > 10000000:  # Too large (over 10 km²)
        return 500000  # Cap at a reasonable park size

    return int(area)
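
# Worked example (informal, values approximate): a quadrilateral of roughly 67 m x 68 m
# near the city center, e.g. (52.5000, 13.4000), (52.5000, 13.4010), (52.5006, 13.4010),
# (52.5006, 13.4000), evaluates to roughly 4,500 m² -- under one hectare, so
# calculate_search_radius() below would pick a 150 m tree search radius for it.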


def calculate_search_radius(area_sqm):
    """Calculate an appropriate tree search radius based on park area."""
    if area_sqm < 10000:  # < 1 hectare
        return 150
    elif area_sqm < 50000:  # < 5 hectares
        return 300
    elif area_sqm < 200000:  # < 20 hectares
        return 500
    else:  # Large parks like Treptower Park
        return 800


def calculate_enhanced_shade_quality(tree_response, area_sqm):
    """Calculate an enhanced shade quality score from real tree characteristics."""
    metrics = tree_response.metrics
    shade_analysis = tree_response.shade_analysis

    # Base score built up from tree density and coverage factors
    base_score = 0

    # Factor 1: actual shade coverage (crown area based)
    coverage = metrics.shade_coverage_percent or 0
    if coverage >= 60:
        base_score += 40
    elif coverage >= 40:
        base_score += 30
    elif coverage >= 20:
        base_score += 20
    elif coverage >= 10:
        base_score += 10

    # Factor 2: large mature trees (better shade)
    large_trees = len(shade_analysis.nearby_large_trees or [])
    if large_trees >= 10:
        base_score += 25
    elif large_trees >= 5:
        base_score += 20
    elif large_trees >= 3:
        base_score += 15
    elif large_trees >= 1:
        base_score += 10

    # Factor 3: tree density per area
    trees_per_hectare = metrics.trees_per_hectare or 0
    if trees_per_hectare >= 50:
        base_score += 20
    elif trees_per_hectare >= 30:
        base_score += 15
    elif trees_per_hectare >= 20:
        base_score += 10
    elif trees_per_hectare >= 10:
        base_score += 5

    # Factor 4: average tree height (taller trees cast better shade)
    avg_height = metrics.average_height or 0
    if avg_height >= 20:
        base_score += 10
    elif avg_height >= 15:
        base_score += 8
    elif avg_height >= 10:
        base_score += 5
    elif avg_height >= 5:
        base_score += 3

    # Factor 5: crown diameter quality
    avg_crown = metrics.average_crown_diameter or 0
    if avg_crown >= 12:
        base_score += 5
    elif avg_crown >= 8:
        base_score += 3
    elif avg_crown >= 5:
        base_score += 1

    return min(100, base_score)
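
# Note: the per-factor maxima above sum to exactly 100 (40 + 25 + 20 + 10 + 5), so the
# min(100, ...) cap only matters if the weights are changed; area_sqm is currently unused.
# Rough example: 45% coverage (30) + 6 large trees (20) + 35 trees/ha (15)
# + 16 m average height (8) + 9 m average crown (3) gives a score of 76.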


def detect_water_features(candidate):
    """Detect water features using OSM tags and name analysis."""
    tags = candidate.get('tags', {})
    name = candidate.get('name', '').lower()

    # Check OSM water-related tags
    water_tags = ['water', 'waterway', 'natural']
    has_water_tags = any(
        tags.get(tag, '').lower() in ['water', 'lake', 'pond', 'reservoir', 'river', 'stream']
        for tag in water_tags
    )

    # Check the name for water indicators (German and English)
    water_names = ['see', 'teich', 'weiher', 'water', 'lake', 'pond', 'fluss', 'river', 'bach', 'creek']
    has_water_name = any(water_word in name for water_word in water_names)

    # Check for fountains ("Brunnen")
    fountain_indicators = ['brunnen', 'fountain', 'springbrunnen']
    has_fountain = any(fountain in name for fountain in fountain_indicators)

    return has_water_tags or has_water_name or has_fountain
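
# Example (informal): a way tagged natural=water matches the tag check, and a space named
# "Plötzensee" matches the "see" substring check; either condition alone returns True.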


def estimate_berlin_district(lat: float, lng: float) -> str:
    """Estimate the Berlin district from coordinates using rough geographic boundaries."""
    # Northern districts
    if lat > 52.55:
        if lng < 13.25:
            return "Reinickendorf"
        elif lng < 13.45:
            return "Pankow"
        else:
            return "Lichtenberg"
    # Central-north districts
    elif lat > 52.52:
        if lng < 13.20:
            return "Spandau"
        elif lng < 13.30:
            return "Charlottenburg-Wilmersdorf"
        elif lng < 13.42:
            return "Mitte"
        elif lng < 13.48:
            return "Friedrichshain-Kreuzberg"
        else:
            return "Lichtenberg"
    # Central districts
    elif lat > 52.48:
        if lng < 13.20:
            return "Spandau"
        elif lng < 13.30:
            return "Charlottenburg-Wilmersdorf"
        elif lng < 13.35:
            return "Tempelhof-Schöneberg"
        elif lng < 13.42:
            return "Mitte"
        elif lng < 13.48:
            return "Friedrichshain-Kreuzberg"
        else:
            return "Lichtenberg"
    # Southern-central districts
    elif lat > 52.45:
        if lng < 13.20:
            return "Steglitz-Zehlendorf"
        elif lng < 13.35:
            return "Tempelhof-Schöneberg"
        elif lng < 13.45:
            return "Neukölln"
        elif lng < 13.55:
            return "Treptow-Köpenick"
        else:
            return "Marzahn-Hellersdorf"
    # Southern districts
    else:
        if lng < 13.35:
            return "Steglitz-Zehlendorf"
        else:
            return "Treptow-Köpenick"


def get_specific_neighborhood(district: str, lat: float, lng: float) -> str:
    """Get a specific neighborhood within a district based on coordinates."""
    # Bounding boxes are keyed as (min_lat, max_lat, min_lng, max_lng)
    neighborhoods = {
        "Mitte": {
            (52.540, 52.560, 13.33, 13.38): "Wedding",
            (52.515, 52.530, 13.33, 13.38): "Moabit",
            (52.510, 52.520, 13.35, 13.38): "Tiergarten",
            (52.525, 52.545, 13.40, 13.43): "Prenzlauer Berg"
        },
        "Charlottenburg-Wilmersdorf": {
            (52.485, 52.505, 13.30, 13.33): "Wilmersdorf",
            (52.505, 52.525, 13.25, 13.33): "Charlottenburg"
        },
        "Friedrichshain-Kreuzberg": {
            (52.490, 52.510, 13.38, 13.42): "Kreuzberg",
            (52.510, 52.525, 13.42, 13.48): "Friedrichshain"
        },
        "Tempelhof-Schöneberg": {
            (52.480, 52.500, 13.33, 13.37): "Schöneberg",
            (52.460, 52.480, 13.37, 13.42): "Tempelhof"
        },
        "Steglitz-Zehlendorf": {
            (52.430, 52.450, 13.23, 13.30): "Zehlendorf",
            (52.450, 52.470, 13.30, 13.35): "Steglitz"
        },
        "Treptow-Köpenick": {
            (52.430, 52.460, 13.55, 13.65): "Köpenick",
            (52.480, 52.500, 13.45, 13.50): "Treptow"
        }
    }

    if district in neighborhoods:
        for (min_lat, max_lat, min_lng, max_lng), neighborhood in neighborhoods[district].items():
            if min_lat <= lat <= max_lat and min_lng <= lng <= max_lng:
                return neighborhood

    # Fall back to the district name if no bounding box matches
    return district
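
# Rough example: a point in the Großer Tiergarten (about 52.513, 13.360) lands in the
# "Mitte" branch of estimate_berlin_district(), and this helper then narrows it to "Tiergarten".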


async def quick_process():
    """Quick processing of significant Berlin green spaces."""
    print("🚀 Quick Berlin Green Spaces Processor")
    print("=" * 45)

    # Initialize services
    tree_service = StreetTreeService()
    berlin_data = BerlinDataService()

    # Pre-load and index trees once to avoid repeated indexing
    print("🔄 Pre-loading tree data and building spatial index...")
    await tree_service._load_trees()

    osm_file = Path("app/data/osm-raw/berlin_green_spaces.osm")

    if not osm_file.exists():
        print("❌ OSM file not found. Please ensure the data is downloaded.")
        return

    print("🔍 Quick filtering for named parks and significant areas...")
    print(f"📁 OSM file size: {osm_file.stat().st_size / (1024 * 1024):.1f} MB")

    # Quick scan for good candidates
    candidates = []

    try:
        processed = 0

        print("🔍 Single-pass XML parsing - ways with embedded coordinates...")

        # Single pass: parse ways that carry embedded node coordinates
        ways_processed = 0
        current_way_tags = {}
        current_way_coordinates = []
        in_way = False

        for event, elem in iterparse(osm_file, events=('start', 'end')):
            if event == 'start':
                if elem.tag == 'way':
                    in_way = True
                    current_way_tags = {}
                    current_way_coordinates = []
                    ways_processed += 1
                    if ways_processed % 1000 == 0:
                        print(f"Processed {ways_processed} ways, found {len(candidates)} candidates so far...")
                elif in_way and elem.tag == 'tag':
                    k = elem.get('k')
                    v = elem.get('v')
                    if k and v:
                        current_way_tags[k] = v
                elif in_way and elem.tag == 'nd':
                    # Extract coordinates directly from the nd element
                    # (assumes an OSM extract where nd elements carry lat/lon attributes)
                    lat = elem.get('lat')
                    lon = elem.get('lon')
                    if lat and lon:
                        current_way_coordinates.append((float(lat), float(lon)))
                continue

            if elem.tag == 'way' and in_way:
                in_way = False
                tags = current_way_tags
                coordinates = current_way_coordinates

                # Quick filters for promising spaces - deliberately lenient
                has_name = 'name' in tags
                is_park = (tags.get('leisure') in ['park', 'garden', 'nature_reserve'] or
                           tags.get('landuse') in ['forest', 'grass', 'recreation_ground'])

                # Also accept common green space tags
                has_green_tags = any(key in tags for key in ['leisure', 'landuse', 'natural', 'amenity'])

                if not (has_name or is_park or has_green_tags):
                    elem.clear()  # Free memory
                    continue

                # Use the embedded coordinates directly
                if not coordinates:
                    elem.clear()  # Free memory
                    continue

                # Get the center coordinate; keep all coordinates for the area calculation
                lat, lng = coordinates[0] if len(coordinates) == 1 else (
                    sum(lat for lat, lng in coordinates) / len(coordinates),
                    sum(lng for lat, lng in coordinates) / len(coordinates)
                )

                # Basic Berlin bounds check
                if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
                    elem.clear()  # Free memory
                    continue

                name = tags.get('name', f"Unnamed {tags.get('leisure', tags.get('landuse', 'area'))}")
                space_type = tags.get('leisure') or tags.get('landuse') or 'park'

                candidate = {
                    'id': f"quick_{elem.get('id')}",
                    'name': name,
                    'type': space_type,
                    'lat': lat,
                    'lng': lng,
                    'has_name': has_name,
                    'tags': tags,
                    'coordinates': coordinates  # Keep all coordinates for the area calculation
                }

                candidates.append(candidate)
                processed += 1

                # Limit for quick processing
                if len(candidates) >= 100:
                    elem.clear()  # Free memory
                    break

                elem.clear()  # Free memory
            else:
                elem.clear()  # Free memory

        print(f"✅ Found {len(candidates)} promising green spaces")

    except Exception as e:
        print(f"❌ Error in quick filtering: {e}")
        return

    if not candidates:
        print("No candidates found")
        return

    # Sort so that named spaces (better quality) come first
    candidates.sort(key=lambda x: x['has_name'], reverse=True)

    print(f"\n🔧 Enhancing top {len(candidates)} spaces with real data...")

    # Process candidates in parallel, in batches
    batch_size = 10  # Process 10 candidates at a time
    enhanced_spaces = []

    async def process_candidate(candidate):
        """Process a single candidate with tree and toilet data."""
        try:
            # Calculate the actual area from the OSM polygon coordinates
            area_sqm = calculate_polygon_area_sqm(candidate.get('coordinates', []))
            search_radius = calculate_search_radius(area_sqm)

            # Fetch real tree data and toilet data concurrently, using the dynamic radius
            tree_task = tree_service.get_trees_near_location(
                candidate['lat'], candidate['lng'], radius_m=search_radius
            )
            toilet_task = berlin_data.get_toilets_near_point(
                candidate['lat'], candidate['lng'], 500
            )

            print(f"🔍 Getting data for {candidate['name'][:30]}... (area: {area_sqm/10000:.1f}ha, radius: {search_radius}m)")
            tree_response, nearby_toilets = await asyncio.gather(tree_task, toilet_task)

            # Create the enhanced space record
            enhanced_space = {
                "id": candidate['id'],
                "name": candidate['name'],
                "description": f"Berlin {candidate['type']} discovered via quick OSM processing",
                "type": "PARK",  # Simplified for now
                "coordinates": {
                    "lat": candidate['lat'],
                    "lng": candidate['lng']
                },
                "neighborhood": get_specific_neighborhood(
                    estimate_berlin_district(candidate['lat'], candidate['lng']),
                    candidate['lat'], candidate['lng']
                ),
                "area_sqm": area_sqm,  # Real calculated area

                # Environmental features from real tree data
                "environmental": {
                    "tree_coverage_percent": max(5, int(tree_response.metrics.shade_coverage_percent or 0)),  # From the actual crown area calculation
                    "shade_quality": calculate_enhanced_shade_quality(tree_response, area_sqm),
                    "noise_level": 2,  # Default
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    "water_features": detect_water_features(candidate),
                    "natural_surface_percent": 80
                },

                # Real tree data
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species[:3]
                },

                # Real toilet data
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": 80 if nearby_toilets else 30,
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },

                # Standard features (static defaults)
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": 3,
                    "cycling_infrastructure": True,
                    "parking_availability": 2,
                    "lighting_quality": 3
                },

                "recreation": {
                    "playground_quality": 60 if candidate['type'] == 'park' else 30,
                    "sports_facilities": candidate['type'] == 'recreation_ground',
                    "running_paths": True,
                    "cycling_paths": True,
                    "dog_friendly": True,
                    "bbq_allowed": candidate['type'] in ['park', 'recreation_ground']
                },

                "osm_metadata": {
                    "has_official_name": candidate['has_name'],
                    "tags": candidate['tags'],
                    "source": "quick_osm_processing"
                },

                "last_updated": datetime.now().isoformat(),
                "data_sources": ["quick_osm_scan", "berlin_tree_cadastre", "berlin_toilets"],
                "confidence_score": 90 if candidate['has_name'] else 75
            }

            return enhanced_space, tree_response.metrics.total_trees, len(nearby_toilets)

        except Exception as e:
            print(f"❌ Error processing {candidate['name']}: {e}")
            return None, 0, 0

    # Process the candidates in batches
    for i in range(0, len(candidates), batch_size):
        batch = candidates[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(candidates) + batch_size - 1)//batch_size}")

        # Process the batch concurrently
        tasks = [process_candidate(candidate) for candidate in batch]
        results = await asyncio.gather(*tasks)

        # Collect results
        for result, trees, toilets in results:
            if result:
                enhanced_spaces.append(result)
                print(f"✅ {result['name'][:40]:40} - {trees:3d} trees, {toilets} toilets")

        # Small delay between batches to be respectful to APIs
        if i + batch_size < len(candidates):
            await asyncio.sleep(0.5)

    # Save results
    output_file = Path("app/data/processed/quick_berlin_green_spaces.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)  # Make sure the output directory exists

    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)

    data = {
        "green_spaces": enhanced_spaces,
        "total_count": len(enhanced_spaces),
        "last_updated": datetime.now().isoformat(),
        "data_sources": ["quick_osm_processing", "berlin_tree_cadastre", "berlin_toilets"],
        "processing_info": {
            "method": "quick_scan_for_named_and_significant_spaces",
            "prioritizes_named_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": len(enhanced_spaces),
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            "tree_coverage": f"{round((with_trees / len(enhanced_spaces)) * 100, 1)}%" if enhanced_spaces else "0%",
            "toilet_coverage": f"{round((with_toilets / len(enhanced_spaces)) * 100, 1)}%" if enhanced_spaces else "0%"
        }
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print("\n🎉 Quick processing complete!")
    print(f"📁 Saved: {output_file}")
    print(f"📊 {len(enhanced_spaces)} spaces enhanced")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print("\n✨ Ready to use! This gives you real Berlin green spaces")
    print("   with actual tree and toilet data for personality scoring!")


if __name__ == "__main__":
    asyncio.run(quick_process())