Add real green space data and neighborhood filtering

This commit is contained in:
Gal 2025-06-21 22:58:50 +02:00
parent c14f5ead38
commit 49e3d8c29d
Signed by: gal
GPG Key ID: F035BC65003BC00B
10 changed files with 934901 additions and 181 deletions

File diff suppressed because it is too large Load Diff

View File

@ -185,3 +185,142 @@ async def get_current_conditions(
return conditions
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to get conditions: {str(e)}")
@router.get("/all")
async def get_all_green_spaces(
    personality: Optional[PersonalityType] = Query(None, description="Personality type for scoring"),
    min_score: int = Query(0, ge=0, le=100, description="Minimum personality score (only applies if personality is provided)"),
    limit: int = Query(50, ge=1, le=200, description="Maximum results"),
):
    """
    List every available green space in Berlin.

    Without ``personality`` the raw dataset is returned; with it, each space
    is scored, filtered by ``min_score`` and ordered best-first.  Handy for
    frontend dropdowns or full-dataset access.
    """
    try:
        spaces = await berlin_data.search_green_spaces()

        if personality:
            # Score every space and keep only those meeting the threshold.
            kept = []
            for candidate in spaces:
                score = await green_space_service.scoring_engine.score_green_space(
                    candidate, personality.value
                )
                if score.score >= min_score:
                    candidate.current_personality_score = score
                    kept.append(candidate)

            # Best matches first.
            kept.sort(
                key=lambda s: s.current_personality_score.score if s.current_personality_score else 0,
                reverse=True,
            )
            spaces = kept

        page = spaces[:limit]
        return {
            "green_spaces": page,
            "total_available": len(spaces),
            "returned_count": len(page),
            "personality": personality.value if personality else None,
            "min_score_applied": min_score if personality else None
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get green spaces: {str(e)}")
def _get_best_features(space) -> list:
    """Collect human-readable highlights of *space* in a fixed priority order.

    Checks tree coverage, water features, playground quality, sports
    facilities and quietness; callers typically keep only the first few.
    """
    features = []
    if space.environmental.tree_coverage_percent > 70:
        features.append("Excellent tree coverage")
    if space.environmental.water_features:
        features.append("Water features")
    if space.recreation.playground_quality > 60:
        features.append("Good playground facilities")
    if space.recreation.sports_facilities:
        features.append("Sports facilities")
    if space.environmental.noise_level.value <= 2:
        features.append("Peaceful atmosphere")
    return features


@router.get("/recommendations/{personality}")
async def get_personality_recommendations(
    personality: PersonalityType,
    limit: int = Query(20, ge=1, le=50, description="Number of recommendations"),
    neighborhood: Optional[str] = Query(None, description="Preferred neighborhood"),
    min_score: int = Query(70, ge=50, le=100, description="Minimum personality score"),
):
    """
    Get personalized green space recommendations.

    Returns the best green spaces for a specific personality type,
    with explanations of why each space is recommended.  Spaces scoring
    below ``min_score`` are excluded; results are ordered best-first.
    """
    try:
        # Get all green spaces (optionally restricted to one neighborhood)
        all_spaces = await berlin_data.search_green_spaces(neighborhood=neighborhood)

        # Score and rank for personality
        recommendations = []
        for space in all_spaces:
            personality_score = await green_space_service.scoring_engine.score_green_space(
                space, personality.value
            )
            if personality_score.score >= min_score:
                space.current_personality_score = personality_score
                recommendation = {
                    "green_space": space,
                    "score": personality_score.score,
                    "explanation": personality_score.explanation,
                    "best_features": _get_best_features(space)[:3],  # Top 3 features
                    "visit_recommendation": _get_visit_recommendation(space, personality.value)
                }
                recommendations.append(recommendation)

        # Sort by score (highest first)
        recommendations.sort(key=lambda x: x["score"], reverse=True)

        return {
            "recommendations": recommendations[:limit],
            "personality": personality.value,
            "total_matches": len(recommendations),
            "search_filters": {
                "neighborhood": neighborhood,
                "min_score": min_score
            }
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get recommendations: {str(e)}")
def _get_visit_recommendation(space, personality: str) -> str:
"""Get a personalized visit recommendation"""
if personality == "little_adventurers":
if space.recreation.playground_quality > 60:
return "Perfect for family adventures with great playground facilities"
return "Great for exploring with kids"
elif personality == "date_night":
if space.environmental.noise_level.value <= 2:
return "Romantic and peaceful setting for couples"
return "Nice atmosphere for a romantic stroll"
elif personality == "zen_masters":
if space.environmental.tree_coverage_percent > 70:
return "Ideal for peaceful meditation under the trees"
return "Perfect for quiet contemplation"
elif personality == "active_lifestyle":
if space.recreation.sports_facilities:
return "Great for workouts and active recreation"
return "Perfect for running and outdoor activities"
elif personality == "wildlife_lover":
if space.environmental.wildlife_diversity_score > 70:
return "Excellent biodiversity for nature observation"
return "Good spot for wildlife watching"
else:
return "Highly recommended for your personality type"

View File

@ -20,6 +20,7 @@ class BerlinDataService:
self.cache = {}
self.last_refresh = None
self._toilets_cache = None
self._green_spaces_cache = None
self._street_trees_index = None
self.data_dir = Path("app/data")
self.street_tree_service = StreetTreeService()
@ -46,8 +47,15 @@ class BerlinDataService:
if distance > radius:
continue
# Apply neighborhood filter
if neighborhood and space.neighborhood.lower() != neighborhood.lower():
# Apply neighborhood filter with flexible matching
if neighborhood:
neighborhood_lower = neighborhood.lower()
space_neighborhood_lower = space.neighborhood.lower()
# Check for exact match or partial match (useful for compound neighborhood names)
if (neighborhood_lower != space_neighborhood_lower and
neighborhood_lower not in space_neighborhood_lower and
space_neighborhood_lower not in neighborhood_lower):
continue
# Apply other filters
@ -214,43 +222,60 @@ class BerlinDataService:
async def get_neighborhood_stats(self) -> Dict[str, Any]:
"""Get statistics for Berlin neighborhoods."""
return {
"neighborhoods": [
{
"name": "mitte",
"display_name": "Mitte",
"green_space_count": 15,
# Get all green spaces to calculate real neighborhood stats
green_spaces = await self._get_mock_green_spaces()
# Count green spaces per neighborhood
neighborhood_counts = {}
neighborhood_spaces = {}
for space in green_spaces:
neighborhood = space.neighborhood
if neighborhood not in neighborhood_counts:
neighborhood_counts[neighborhood] = 0
neighborhood_spaces[neighborhood] = []
neighborhood_counts[neighborhood] += 1
neighborhood_spaces[neighborhood].append(space)
# Generate neighborhood stats
neighborhoods = []
for neighborhood, count in neighborhood_counts.items():
# Calculate average personality scores based on green space features
spaces = neighborhood_spaces[neighborhood]
# Calculate scores based on actual features
total_tree_coverage = sum(s.environmental.tree_coverage_percent for s in spaces)
total_playgrounds = sum(s.recreation.playground_quality for s in spaces)
total_water_features = sum(1 for s in spaces if s.environmental.water_features)
total_sports = sum(1 for s in spaces if s.recreation.sports_facilities)
avg_tree_coverage = total_tree_coverage / count if count > 0 else 0
avg_playground = total_playgrounds / count if count > 0 else 0
water_ratio = total_water_features / count if count > 0 else 0
sports_ratio = total_sports / count if count > 0 else 0
# Calculate personality scores based on features
little_adventurers = min(100, int(avg_playground * 0.8 + sports_ratio * 30 + 40))
date_night = min(100, int(avg_tree_coverage * 0.6 + water_ratio * 25 + 45))
squad_goals = min(100, int(sports_ratio * 40 + avg_tree_coverage * 0.4 + 35))
zen_masters = min(100, int(avg_tree_coverage * 0.7 + water_ratio * 20 + 30))
neighborhoods.append({
"name": neighborhood.lower().replace(' ', '_').replace('-', '_'),
"display_name": neighborhood,
"green_space_count": count,
"avg_personality_scores": {
"little_adventurers": 75,
"date_night": 80,
"squad_goals": 70,
"zen_masters": 65
}
},
{
"name": "kreuzberg",
"display_name": "Kreuzberg",
"green_space_count": 12,
"avg_personality_scores": {
"little_adventurers": 70,
"date_night": 75,
"squad_goals": 85,
"zen_masters": 60
}
},
{
"name": "prenzlauer_berg",
"display_name": "Prenzlauer Berg",
"green_space_count": 18,
"avg_personality_scores": {
"little_adventurers": 90,
"date_night": 70,
"squad_goals": 75,
"zen_masters": 70
}
}
]
"little_adventurers": little_adventurers,
"date_night": date_night,
"squad_goals": squad_goals,
"zen_masters": zen_masters
}
})
# Sort by green space count (most spaces first)
neighborhoods.sort(key=lambda x: x["green_space_count"], reverse=True)
return {"neighborhoods": neighborhoods}
async def get_current_conditions(self, lat: float, lng: float) -> Dict[str, Any]:
"""Get current conditions at a location."""
@ -394,122 +419,76 @@ class BerlinDataService:
# Return original space if enhancement fails
return green_space
async def _get_mock_green_spaces(self) -> List[GreenSpace]:
"""Get mock green spaces data for development."""
# This would be replaced with real data fetching in production
return [
GreenSpace(
id="tiergarten_1",
name="Tiergarten",
description="Berlin's most famous park in the heart of the city",
type=GreenSpaceType.PARK,
coordinates=Coordinates(lat=52.5145, lng=13.3501),
neighborhood="Mitte",
address="Tiergarten, 10557 Berlin",
area_sqm=210000,
perimeter_m=5800,
def _load_green_spaces(self) -> List[Dict]:
    """Return the raw green-space records, loading them on first use.

    Reads ``processed/quick_berlin_green_spaces.json`` under ``self.data_dir``
    once and memoizes the result; a missing file yields a cached empty list.
    """
    if self._green_spaces_cache is not None:
        return self._green_spaces_cache

    source = self.data_dir / "processed" / "quick_berlin_green_spaces.json"
    if source.exists():
        with open(source, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        self._green_spaces_cache = payload.get("green_spaces", [])
    else:
        print("Warning: quick_berlin_green_spaces.json not found.")
        self._green_spaces_cache = []
    return self._green_spaces_cache
def _convert_json_to_green_space(self, json_data: Dict) -> GreenSpace:
"""Convert JSON data to GreenSpace model"""
from datetime import datetime
return GreenSpace(
id=json_data.get("id", ""),
name=json_data.get("name", ""),
description=json_data.get("description", ""),
type=GreenSpaceType.PARK, # Default to PARK, could be enhanced later
coordinates=Coordinates(
lat=json_data.get("coordinates", {}).get("lat", 0.0),
lng=json_data.get("coordinates", {}).get("lng", 0.0)
),
neighborhood=json_data.get("neighborhood", "Berlin"),
address=f"{json_data.get('name', 'Unknown')}, Berlin",
area_sqm=json_data.get("area_sqm", 0),
perimeter_m=json_data.get("perimeter_m", None),
environmental=EnvironmentalFeatures(
tree_coverage_percent=85,
shade_quality=90,
noise_level=NoiseLevel.MODERATE,
wildlife_diversity_score=80,
water_features=True,
natural_surface_percent=95
tree_coverage_percent=json_data.get("environmental", {}).get("tree_coverage_percent", 0),
shade_quality=json_data.get("environmental", {}).get("shade_quality", 0),
noise_level=NoiseLevel(json_data.get("environmental", {}).get("noise_level", 1)),
wildlife_diversity_score=json_data.get("environmental", {}).get("wildlife_diversity_score", 0),
water_features=json_data.get("environmental", {}).get("water_features", False),
natural_surface_percent=json_data.get("environmental", {}).get("natural_surface_percent", 0)
),
accessibility=AccessibilityFeatures(
wheelchair_accessible=True,
public_transport_score=5,
cycling_infrastructure=True,
parking_availability=3,
lighting_quality=4
wheelchair_accessible=json_data.get("accessibility", {}).get("wheelchair_accessible", True),
public_transport_score=json_data.get("accessibility", {}).get("public_transport_score", 3),
cycling_infrastructure=json_data.get("accessibility", {}).get("cycling_infrastructure", True),
parking_availability=json_data.get("accessibility", {}).get("parking_availability", 2),
lighting_quality=json_data.get("accessibility", {}).get("lighting_quality", 3)
),
recreation=RecreationFeatures(
playground_quality=70,
sports_facilities=True,
running_paths=True,
cycling_paths=True,
dog_friendly=True,
bbq_allowed=False
playground_quality=json_data.get("recreation", {}).get("playground_quality", 0),
sports_facilities=json_data.get("recreation", {}).get("sports_facilities", False),
running_paths=json_data.get("recreation", {}).get("running_paths", True),
cycling_paths=json_data.get("recreation", {}).get("cycling_paths", True),
dog_friendly=json_data.get("recreation", {}).get("dog_friendly", True),
bbq_allowed=json_data.get("recreation", {}).get("bbq_allowed", False)
),
nearby_amenities=[],
last_updated=datetime.now(),
data_sources=["berlin_open_data", "osm"],
confidence_score=95
),
GreenSpace(
id="volkspark_friedrichshain",
name="Volkspark Friedrichshain",
description="Historic park with fairy tale fountain and sports facilities",
type=GreenSpaceType.PARK,
coordinates=Coordinates(lat=52.5263, lng=13.4317),
neighborhood="Friedrichshain",
address="Friedrichshain, 10249 Berlin",
area_sqm=49000,
perimeter_m=2800,
environmental=EnvironmentalFeatures(
tree_coverage_percent=70,
shade_quality=75,
noise_level=NoiseLevel.QUIET,
wildlife_diversity_score=65,
water_features=True,
natural_surface_percent=80
),
accessibility=AccessibilityFeatures(
wheelchair_accessible=True,
public_transport_score=4,
cycling_infrastructure=True,
parking_availability=2,
lighting_quality=3
),
recreation=RecreationFeatures(
playground_quality=85,
sports_facilities=True,
running_paths=True,
cycling_paths=True,
dog_friendly=True,
bbq_allowed=True
),
nearby_amenities=[],
last_updated=datetime.now(),
data_sources=["berlin_open_data", "osm"],
confidence_score=90
),
GreenSpace(
id="tempelhofer_feld",
name="Tempelhofer Feld",
description="Former airport turned into unique urban park",
type=GreenSpaceType.PARK,
coordinates=Coordinates(lat=52.4732, lng=13.4015),
neighborhood="Tempelhof",
address="Tempelhofer Damm, 12101 Berlin",
area_sqm=300000,
perimeter_m=6200,
environmental=EnvironmentalFeatures(
tree_coverage_percent=15,
shade_quality=20,
noise_level=NoiseLevel.MODERATE,
wildlife_diversity_score=40,
water_features=False,
natural_surface_percent=60
),
accessibility=AccessibilityFeatures(
wheelchair_accessible=True,
public_transport_score=4,
cycling_infrastructure=True,
parking_availability=4,
lighting_quality=2
),
recreation=RecreationFeatures(
playground_quality=30,
sports_facilities=False,
running_paths=True,
cycling_paths=True,
dog_friendly=True,
bbq_allowed=True
),
nearby_amenities=[],
last_updated=datetime.now(),
data_sources=["berlin_open_data", "osm"],
confidence_score=85
last_updated=datetime.fromisoformat(json_data.get("last_updated", datetime.now().isoformat())),
data_sources=json_data.get("data_sources", []),
confidence_score=json_data.get("confidence_score", 85)
)
]
async def _get_mock_green_spaces(self) -> List[GreenSpace]:
    """Get green spaces data from JSON file.

    Converts each raw record to a :class:`GreenSpace`; records that fail
    conversion are logged and skipped so one bad entry cannot sink the load.
    """
    converted = []
    for raw in self._load_green_spaces():
        try:
            converted.append(self._convert_json_to_green_space(raw))
        except Exception as e:
            print(f"Error converting green space {raw.get('id', 'unknown')}: {e}")
    return converted

View File

@ -4,6 +4,10 @@ from pathlib import Path
from typing import List, Optional, Tuple, Dict, Any
from datetime import datetime
from geopy.distance import geodesic
from rtree import index
import asyncio
import aiofiles
from functools import lru_cache
from app.models.street_tree import (
StreetTree, TreeDensityMetrics, TreeShadeAnalysis, TreesSearchFilters,
@ -14,24 +18,58 @@ from app.models.green_space import Coordinates
class StreetTreeService:
"""Service for accessing and analyzing Berlin street trees data."""
def __init__(self):
self._trees_cache = None
self._trees_index = None
self.data_dir = Path("app/data")
_instance = None
_initialized = False
def _load_trees(self) -> List[Dict]:
"""Load street trees data from JSON file."""
def __new__(cls):
    # Singleton: always hand back the one shared instance so the tree
    # cache and spatial index are built only once per process.
    if cls._instance is None:
        cls._instance = super().__new__(cls)
    return cls._instance
def __init__(self):
    # __init__ runs on every StreetTreeService() call (the singleton
    # __new__ returns the same object), so guard against re-initializing
    # and wiping the caches on subsequent constructions.
    if not self._initialized:
        self._trees_cache = None  # raw tree dicts loaded from JSON
        self._spatial_index = None  # R-tree index over tree positions
        self._tree_id_to_data = {}  # R-tree id -> raw tree dict
        self.data_dir = Path("app/data")
        self.__class__._initialized = True
async def _load_trees(self) -> List[Dict]:
"""Load street trees data from JSON file and build spatial index."""
if self._trees_cache is None:
trees_file = self.data_dir / "processed" / "street_trees.json"
if trees_file.exists():
with open(trees_file, 'r', encoding='utf-8') as f:
data = json.load(f)
print("🔄 Loading trees data and building spatial index...")
async with aiofiles.open(trees_file, 'r', encoding='utf-8') as f:
content = await f.read()
data = json.loads(content)
self._trees_cache = data.get("street_trees", [])
await self._build_spatial_index()
print(f"✅ Loaded {len(self._trees_cache)} trees with spatial index")
else:
print("Warning: street_trees.json not found. Run process_street_trees.py first.")
self._trees_cache = []
return self._trees_cache
async def _build_spatial_index(self):
    """Build R-tree spatial index for fast location queries.

    Inserts one degenerate bounding box (a point) per tree into
    ``self._spatial_index`` and maps each R-tree id back to its raw record
    in ``self._tree_id_to_data``.  No-op when the index already exists or
    no tree data has been loaded.
    """
    if self._spatial_index is None and self._trees_cache:
        print("🔨 Building spatial index...")
        self._spatial_index = index.Index()
        self._tree_id_to_data = {}
        for i, tree_data in enumerate(self._trees_cache):
            lat = tree_data.get('lat')
            lng = tree_data.get('lng')
            # Trees without coordinates cannot be indexed; skip them.
            if lat is not None and lng is not None:
                # R-tree expects (minx, miny, maxx, maxy); a point is a
                # zero-area box with lng as x and lat as y.
                bbox = (lng, lat, lng, lat)
                self._spatial_index.insert(i, bbox)
                self._tree_id_to_data[i] = tree_data
        print(f"✅ Spatial index built for {len(self._tree_id_to_data)} trees")
def _create_tree_from_dict(self, tree_data: Dict) -> StreetTree:
"""Convert tree dictionary to StreetTree model."""
@ -94,6 +132,11 @@ class StreetTreeService:
last_updated=datetime.now()
)
@staticmethod
@lru_cache(maxsize=1000)
def _distance_cache(lat1: float, lng1: float, lat2: float, lng2: float) -> float:
    """Memoized geodesic distance in meters between two WGS84 points.

    Declared a staticmethod so the cache keys only on the coordinates:
    ``lru_cache`` on a bound method would include ``self`` in every key
    and keep the instance alive for the cache's lifetime (ruff B019).
    Call sites using ``self._distance_cache(...)`` are unaffected.
    """
    return geodesic((lat1, lng1), (lat2, lng2)).meters
async def get_trees_near_location(
self,
lat: float,
@ -101,31 +144,48 @@ class StreetTreeService:
radius_m: int = 500,
limit: Optional[int] = None
) -> TreesNearLocationResponse:
"""Get street trees within a radius of a location."""
"""Get street trees within a radius of a location using spatial index."""
start_time = datetime.now()
trees_data = self._load_trees()
await self._load_trees()
nearby_trees = []
for tree_data in trees_data:
if self._spatial_index is None:
# Fallback to linear search if index failed
return await self._get_trees_linear_search(lat, lng, radius_m, limit)
# Convert radius to approximate bounding box for R-tree query
# Rough approximation: 1 degree ≈ 111km
radius_deg = radius_m / 111000
bbox = (lng - radius_deg, lat - radius_deg, lng + radius_deg, lat + radius_deg)
# Query spatial index for candidates
candidate_ids = list(self._spatial_index.intersection(bbox))
# Filter candidates by exact distance
tree_distances = []
for tree_id in candidate_ids:
tree_data = self._tree_id_to_data.get(tree_id)
if not tree_data:
continue
tree_lat = tree_data.get('lat')
tree_lng = tree_data.get('lng')
if tree_lat is None or tree_lng is None:
continue
distance = geodesic((lat, lng), (tree_lat, tree_lng)).meters
distance = self._distance_cache(lat, lng, tree_lat, tree_lng)
if distance <= radius_m:
tree = self._create_tree_from_dict(tree_data)
nearby_trees.append(tree)
tree_distances.append((tree, distance))
if limit and len(nearby_trees) >= limit:
if limit and len(tree_distances) >= limit:
break
# Sort by distance
nearby_trees.sort(
key=lambda t: geodesic((lat, lng), (t.coordinates.lat, t.coordinates.lng)).meters
)
tree_distances.sort(key=lambda x: x[1])
nearby_trees = [tree for tree, _ in tree_distances]
# Calculate metrics
metrics = self._calculate_tree_density_metrics(nearby_trees, radius_m)
@ -212,7 +272,7 @@ class StreetTreeService:
large_trees = []
for tree in trees:
distance = geodesic((lat, lng), (tree.coordinates.lat, tree.coordinates.lng)).meters
distance = self._distance_cache(lat, lng, tree.coordinates.lat, tree.coordinates.lng)
if distance <= 50:
trees_50m += 1
@ -259,9 +319,58 @@ class StreetTreeService:
canopy_density=len(large_trees) / max(1, len(trees)) if trees else 0
)
async def _get_trees_linear_search(
    self,
    lat: float,
    lng: float,
    radius_m: int = 500,
    limit: Optional[int] = None
) -> TreesNearLocationResponse:
    """Fallback linear search used when the spatial index is unavailable.

    Scans every loaded tree, keeps those within ``radius_m`` meters of the
    query point (stopping early at ``limit``), and returns them nearest
    first together with density and shade metrics.
    """
    started = datetime.now()
    records = await self._load_trees()

    matches = []
    for record in records:
        point_lat = record.get('lat')
        point_lng = record.get('lng')
        # Records without coordinates cannot be measured; skip them.
        if point_lat is None or point_lng is None:
            continue

        if self._distance_cache(lat, lng, point_lat, point_lng) <= radius_m:
            matches.append(self._create_tree_from_dict(record))
            if limit and len(matches) >= limit:
                break

    # Nearest trees first.
    matches.sort(
        key=lambda t: self._distance_cache(lat, lng, t.coordinates.lat, t.coordinates.lng)
    )

    metrics = self._calculate_tree_density_metrics(matches, radius_m)
    shade_analysis = self._analyze_shade_coverage(lat, lng, matches)
    elapsed_ms = (datetime.now() - started).total_seconds() * 1000

    return TreesNearLocationResponse(
        location=Coordinates(lat=lat, lng=lng),
        radius_m=radius_m,
        trees=matches,
        metrics=metrics,
        shade_analysis=shade_analysis,
        total_found=len(matches),
        query_time_ms=int(elapsed_ms)
    )
async def search_trees(self, filters: TreesSearchFilters) -> List[StreetTree]:
"""Search trees with filters."""
trees_data = self._load_trees()
trees_data = await self._load_trees()
filtered_trees = []
for tree_data in trees_data:
@ -272,10 +381,10 @@ class StreetTreeService:
if tree_lat is None or tree_lng is None:
continue
distance = geodesic(
(filters.center_lat, filters.center_lng),
(tree_lat, tree_lng)
).meters
distance = self._distance_cache(
filters.center_lat, filters.center_lng,
tree_lat, tree_lng
)
if distance > filters.within_radius_m:
continue
@ -305,7 +414,7 @@ class StreetTreeService:
async def get_tree_stats(self) -> Dict[str, Any]:
"""Get overall statistics about Berlin street trees."""
trees_data = self._load_trees()
trees_data = await self._load_trees()
if not trees_data:
return {"error": "No tree data available"}

View File

@ -35,6 +35,8 @@ dependencies = [
"redis>=5.0.0",
"aiofiles>=23.2.0",
"openpyxl>=3.1.5",
"tqdm>=4.67.1",
"rtree>=1.4.0",
]
[project.optional-dependencies]

View File

@ -0,0 +1,467 @@
#!/usr/bin/env python3
"""
Enhanced Berlin green space processor using existing tree and toilet services.
Downloads OSM green space boundaries and enhances them with real data using existing services.
"""
import os
import json
import zipfile
import requests
import asyncio
from pathlib import Path
import geopandas as gpd
import pandas as pd
from datetime import datetime
from typing import List, Dict, Optional
import sys
# Add the app directory to Python path to import services
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
class RealDataGreenSpaceProcessor:
    """Builds an enriched Berlin green-space dataset.

    Combines OSM green-space polygons with real tree and toilet data
    obtained through the existing application services.
    """

    def __init__(self, data_dir: str = "app/data"):
        """Prepare raw/processed directories and the backing services."""
        self.data_dir = Path(data_dir)
        self.raw_dir = self.data_dir / "geo-raw"
        self.processed_dir = self.data_dir / "processed"

        # Create directories
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)

        # Initialize existing services (street-tree cadastre + toilets/green spaces)
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()
def download_berlin_districts(self):
    """Fetch the Berlin district (Bezirk) boundary GeoJSON into the raw dir.

    Skips the download when the file is already present; returns the path
    to the local GeoJSON file.  Re-raises any download error after logging.
    """
    target = self.raw_dir / "bezirksgrenzen.geojson"
    if target.exists():
        print(f"Berlin district data already exists: {target}")
        return target

    link = "https://tsb-opendata.s3.eu-central-1.amazonaws.com/bezirksgrenzen/bezirksgrenzen.geojson"
    print(f"Downloading Berlin district data from {link}")
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
        target.write_bytes(response.content)
        print(f"Downloaded to {target}")
        return target
    except Exception as e:
        print(f"Error downloading districts: {e}")
        raise
def download_osm_data(self):
    """Download and extract the Geofabrik Berlin OSM shapefile bundle.

    Skips work already done: when the three shapefiles we read are
    extracted, returns immediately; when only the zip exists, skips the
    download and re-extracts.  Returns the extraction directory.
    """
    zip_file = self.raw_dir / "berlin_shapes.zip"
    shp_dir = self.raw_dir / "berlin_shapes"

    # Check if already extracted (only the three layers this pipeline reads)
    required_files = ["gis_osm_landuse_a_free_1.shp", "gis_osm_natural_a_free_1.shp", "gis_osm_leisure_a_free_1.shp"]
    if all((shp_dir / f).exists() for f in required_files):
        print(f"Berlin OSM data already exists: {shp_dir}")
        return shp_dir

    if not zip_file.exists():
        link = "https://download.geofabrik.de/europe/germany/berlin-latest-free.shp.zip"
        print(f"Downloading Berlin OSM data from {link} (this may take several minutes...)")
        try:
            # Stream to disk in chunks; the archive is large.
            response = requests.get(link, stream=True, timeout=300)  # 5 min timeout
            response.raise_for_status()

            with open(zip_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Download completed: {zip_file}")
        except Exception as e:
            print(f"Error downloading OSM data: {e}")
            raise

    print(f"Extracting Berlin OSM data to {shp_dir}")
    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(shp_dir)
        print(f"Extracted to {shp_dir}")
    except Exception as e:
        print(f"Error extracting OSM data: {e}")
        raise

    return shp_dir
def load_osm_green_spaces(self):
    """Load OSM green space polygons clipped to Berlin, with metric areas.

    Returns a GeoDataFrame of landuse/natural/leisure polygons of at least
    1000 m², annotated with the district (``Bezirk``) and centroid
    lat/lng columns used by the enrichment step.

    Raises:
        ValueError: when no green-space features survive filtering.
    """
    print("Loading OSM green space boundaries...")

    # Download required data
    districts_file = self.download_berlin_districts()
    shp_dir = self.download_osm_data()

    # Load Berlin districts for clipping
    districts = gpd.read_file(districts_file)

    # Define green space categories we want
    green_categories = {
        'landuse': ['forest', 'grass', 'meadow', 'recreation_ground', 'village_green', 'allotments'],
        'natural': ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood'],
        'leisure': ['park', 'garden', 'nature_reserve', 'playground', 'pitch', 'common', 'golf_course']
    }

    all_green_spaces = []

    # Process each category
    for category, subcategories in green_categories.items():
        shapefile = shp_dir / f"gis_osm_{category}_a_free_1.shp"
        if not shapefile.exists():
            print(f"Warning: {shapefile} not found, skipping")
            continue

        print(f"Processing {category} data...")
        try:
            gdf = gpd.read_file(shapefile)

            # Filter to relevant subcategories
            gdf_filtered = gdf[gdf['fclass'].isin(subcategories)].copy()
            if len(gdf_filtered) == 0:
                print(f"No {category} features found in subcategories")
                continue

            # Clip to Berlin boundaries
            gdf_clipped = gpd.clip(gdf_filtered, districts)

            # BUG FIX: ``.area`` on a geographic CRS (EPSG:4326, which the
            # Geofabrik shapefiles use) yields square *degrees*, so the
            # 1000 "sqm" threshold previously measured the wrong unit.
            # Project to ETRS89 / UTM 33N (EPSG:25833, metric, covers
            # Berlin) before measuring.
            if gdf_clipped.crs is not None:
                gdf_clipped['area_sqm'] = gdf_clipped.to_crs(epsg=25833).geometry.area
            else:
                # No CRS recorded: fall back to raw units rather than guess.
                gdf_clipped['area_sqm'] = gdf_clipped.geometry.area
            gdf_clipped = gdf_clipped[gdf_clipped['area_sqm'] >= 1000]

            if len(gdf_clipped) > 0:
                all_green_spaces.append(gdf_clipped)
                print(f"Found {len(gdf_clipped)} {category} features")
        except Exception as e:
            print(f"Error processing {category}: {e}")
            continue

    if not all_green_spaces:
        raise ValueError("No green space data found")

    # Combine all green spaces
    green_spaces = gpd.GeoDataFrame(pd.concat(all_green_spaces, ignore_index=True))

    # Add district information
    green_spaces = gpd.sjoin(green_spaces, districts[['Bezirk', 'geometry']], how='left')

    # Centroids kept in lat/lng (geographic CRS) for the downstream services.
    green_spaces['centroid'] = green_spaces.geometry.centroid
    green_spaces['centroid_lat'] = green_spaces.centroid.y
    green_spaces['centroid_lng'] = green_spaces.centroid.x

    print(f"Total green spaces found: {len(green_spaces)}")
    return green_spaces
async def enhance_green_space_with_real_data(self, row):
    """Enhance a single green space with real tree and toilet data.

    *row* is one GeoDataFrame row carrying ``centroid_lat``/``centroid_lng``,
    ``area_sqm`` and OSM attributes (``fclass``, ``name``, ``Bezirk``).
    Returns the enriched dict, or ``None`` when enhancement fails so the
    caller can drop the record without aborting the run.
    """
    try:
        lat = row['centroid_lat']
        lng = row['centroid_lng']
        area_sqm = int(row['area_sqm'])

        # Use existing tree service to get real tree data.
        # Radius scales with sqrt(area) so larger parks sample farther
        # around their centroid, capped at 400 m.
        tree_response = await self.tree_service.get_trees_near_location(
            lat, lng, radius_m=min(400, int((area_sqm ** 0.5) * 1.5))  # Adaptive radius
        )

        # Use existing toilet service to get real toilet data
        nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 800)

        # Calculate toilet accessibility score
        toilet_score = self._score_toilet_accessibility(nearby_toilets)

        # Map OSM type to our enum
        space_type = self._map_osm_to_space_type(row.get('fclass', ''))

        # Generate ID (row.name is the pandas index label of this row)
        space_id = f"real_{row.get('fclass', 'unknown')}_{row.name}"

        # Create enhanced green space using real data
        enhanced_space = {
            "id": space_id,
            "name": row.get('name') or f"{row.get('fclass', 'Green Space').title()} in {row.get('Bezirk', 'Berlin')}",
            "description": f"Real Berlin {row.get('fclass', 'green space')} enhanced with tree and toilet data",
            "type": space_type,
            "coordinates": {
                "lat": float(lat),
                "lng": float(lng)
            },
            "neighborhood": row.get('Bezirk', 'Unknown'),
            "area_sqm": area_sqm,
            # NOTE(review): geometry.length is in CRS units — degrees if the
            # geometry is still EPSG:4326; confirm upstream reprojection.
            "perimeter_m": int(row.geometry.length) if hasattr(row.geometry, 'length') else 0,

            # Environmental features using real tree data
            "environmental": {
                "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                "shade_quality": tree_response.shade_analysis.shade_quality_score,
                "noise_level": self._estimate_noise_level(row.get('fclass', ''), row.get('Bezirk', '')),
                "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                # Heuristic: treat "water"/"river" in the tags/name as water features.
                "water_features": 'water' in str(row.get('fclass', '')).lower() or 'river' in str(row.get('name', '')).lower(),
                "natural_surface_percent": self._estimate_natural_surface(row.get('fclass', ''))
            },

            # Real tree metrics from existing service
            "tree_data": {
                "total_trees": tree_response.metrics.total_trees,
                "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                "species_count": len(tree_response.metrics.dominant_species),
                "species_diversity_score": tree_response.metrics.species_diversity_score,
                "mature_trees_count": tree_response.metrics.mature_trees_count,
                "young_trees_count": tree_response.metrics.young_trees_count,
                "average_tree_age": tree_response.metrics.average_tree_age,
                "average_height": tree_response.metrics.average_height,
                "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                "dominant_species": tree_response.metrics.dominant_species
            },

            # Real toilet accessibility from existing service
            "toilet_accessibility": {
                "nearby_toilets_count": len(nearby_toilets),
                "accessibility_score": toilet_score,
                # assumes nearby_toilets is sorted nearest-first — TODO confirm
                "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
            },

            # Standard accessibility features (static defaults, size-derived flags)
            "accessibility": {
                "wheelchair_accessible": True,
                "public_transport_score": 3,  # Could be enhanced with real transit data
                "cycling_infrastructure": area_sqm > 5000,
                "parking_availability": 2,
                "lighting_quality": 2
            },

            # Recreation features based on OSM data and size
            "recreation": {
                "playground_quality": self._estimate_playground_quality(row.get('fclass', ''), tree_response.metrics.total_trees),
                "sports_facilities": 'pitch' in str(row.get('fclass', '')).lower() or 'sport' in str(row.get('name', '')).lower(),
                "running_paths": area_sqm > 8000,
                "cycling_paths": area_sqm > 15000,
                "dog_friendly": True,
                "bbq_allowed": row.get('fclass') in ['park', 'recreation_ground'] and area_sqm > 5000
            },

            "last_updated": datetime.now().isoformat(),
            "data_sources": ["openstreetmap", "berlin_tree_cadastre", "berlin_toilets"],
            "confidence_score": 95
        }
        return enhanced_space
    except Exception as e:
        print(f"Error enhancing green space {row.name}: {e}")
        return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
"""Score toilet accessibility using existing toilet data."""
if not nearby_toilets:
return 20
nearest_distance = nearby_toilets[0]['distance_meters']
# Distance-based scoring
if nearest_distance <= 200:
score = 100
elif nearest_distance <= 400:
score = 80
elif nearest_distance <= 600:
score = 60
else:
score = 40
# Bonuses for quality
free_toilets = len([t for t in nearby_toilets if t.get('is_free', False)])
accessible_toilets = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
score += min(20, free_toilets * 5 + accessible_toilets * 3)
return min(100, score)
def _map_osm_to_space_type(self, fclass: str) -> str:
"""Map OSM feature class to green space types."""
mapping = {
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN',
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
'wood': 'FOREST', 'heath': 'HEATH', 'pitch': 'SPORTS_AREA',
'golf_course': 'SPORTS_AREA', 'common': 'PARK', 'village_green': 'GRASS',
'allotments': 'GARDEN'
}
return mapping.get(fclass, 'PARK')
def _estimate_noise_level(self, fclass: str, district: str) -> int:
"""Estimate noise level (1=very quiet, 5=very noisy)."""
base_noise = {
'forest': 1, 'nature_reserve': 1, 'wood': 1,
'meadow': 2, 'grass': 2, 'heath': 2,
'park': 2, 'garden': 2, 'common': 2,
'recreation_ground': 3, 'playground': 3, 'pitch': 3,
'golf_course': 2, 'allotments': 2
}
# Central districts are noisier
central_districts = ['Mitte', 'Kreuzberg', 'Friedrichshain']
district_modifier = 1 if district in central_districts else 0
return min(5, base_noise.get(fclass, 2) + district_modifier)
def _estimate_natural_surface(self, fclass: str) -> int:
"""Estimate percentage of natural surface."""
surface_map = {
'forest': 95, 'nature_reserve': 95, 'wood': 95,
'meadow': 95, 'grass': 90, 'heath': 90,
'park': 75, 'garden': 65, 'common': 80,
'recreation_ground': 60, 'playground': 40, 'pitch': 20,
'golf_course': 70, 'allotments': 85
}
return surface_map.get(fclass, 70)
def _estimate_playground_quality(self, fclass: str, tree_count: int) -> int:
"""Estimate playground quality score."""
base_scores = {
'playground': 85,
'park': 65,
'recreation_ground': 70,
'garden': 40,
'common': 50
}
base = base_scores.get(fclass, 25)
# Trees improve playground appeal for families
tree_bonus = min(15, tree_count // 5) # +3 per 5 trees, max 15
return min(100, base + tree_bonus)
async def process_all_green_spaces(self):
    """Enhance every OSM green space with real tree and toilet data.

    Loads the OSM boundaries (assumed to be a pandas DataFrame, given the
    iloc/iterrows usage — TODO confirm load_osm_green_spaces' contract),
    then enriches each row in batches, pausing briefly between batches so
    the backing services are not overwhelmed.

    Returns:
        list[dict]: successfully enhanced records; rows whose enhancement
        returned None are skipped silently.
    """
    print("Starting enhanced green space processing with real data...")
    # Load OSM green space boundaries
    osm_green_spaces = self.load_osm_green_spaces()
    enhanced_green_spaces = []
    print(f"Enhancing {len(osm_green_spaces)} green spaces with real tree and toilet data...")
    # Process in batches to avoid overwhelming the system
    batch_size = 50
    total_processed = 0
    for i in range(0, len(osm_green_spaces), batch_size):
        batch = osm_green_spaces.iloc[i:i+batch_size]
        batch_results = []
        for idx, row in batch.iterrows():
            result = await self.enhance_green_space_with_real_data(row)
            if result:
                batch_results.append(result)
            # Progress counter covers every row, enhanced or not.
            total_processed += 1
            if total_processed % 25 == 0:
                print(f"Processed {total_processed}/{len(osm_green_spaces)} green spaces...")
        enhanced_green_spaces.extend(batch_results)
        # Small delay between batches
        await asyncio.sleep(0.1)
    print(f"Successfully enhanced {len(enhanced_green_spaces)} green spaces with real data")
    return enhanced_green_spaces
def save_enhanced_data(self, enhanced_green_spaces: List[Dict]):
    """Write enhanced green spaces plus summary statistics to JSON.

    Fix: the summary print statements previously divided by
    len(enhanced_green_spaces) without a guard and raised
    ZeroDivisionError for an empty input; percentages are now computed
    once, guarded, and reused for both the payload and the prints.

    Args:
        enhanced_green_spaces: records produced by
            process_all_green_spaces(); may be empty.

    Returns:
        Path to the written JSON file.
    """
    output_file = self.processed_dir / "real_berlin_green_spaces.json"
    # Calculate summary statistics (all guarded against an empty list).
    total_count = len(enhanced_green_spaces)
    spaces_with_trees = len([gs for gs in enhanced_green_spaces if gs["tree_data"]["total_trees"] > 0])
    spaces_with_toilets = len([gs for gs in enhanced_green_spaces if gs["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(gs["tree_data"]["total_trees"] for gs in enhanced_green_spaces)
    avg_species_per_space = sum(gs["tree_data"]["species_count"] for gs in enhanced_green_spaces) / total_count if total_count else 0
    tree_coverage_pct = round((spaces_with_trees / total_count) * 100, 1) if total_count else 0
    toilet_coverage_pct = round((spaces_with_toilets / total_count) * 100, 1) if total_count else 0
    data = {
        "green_spaces": enhanced_green_spaces,
        "total_count": total_count,
        "last_updated": datetime.now().isoformat(),
        "data_sources": [
            "openstreetmap_boundaries",
            "berlin_tree_cadastre_via_service",
            "berlin_toilet_locations_via_service",
            "berlin_districts"
        ],
        "processing_info": {
            "script_version": "1.0",
            "coordinate_system": "WGS84",
            "uses_existing_services": True,
            "tree_analysis_via": "StreetTreeService",
            "toilet_analysis_via": "BerlinDataService"
        },
        "summary_stats": {
            "spaces_with_trees": spaces_with_trees,
            "spaces_with_nearby_toilets": spaces_with_toilets,
            "total_trees_in_all_spaces": total_trees,
            "average_species_per_space": round(avg_species_per_space, 1),
            "coverage_percentage": {
                "with_tree_data": tree_coverage_pct,
                "with_toilet_data": toilet_coverage_pct
            }
        }
    }
    # ensure_ascii=False keeps German umlauts readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {len(enhanced_green_spaces)} enhanced green spaces to {output_file}")
    print(f"📊 Summary:")
    print(f" - {spaces_with_trees} spaces have tree data ({tree_coverage_pct}%)")
    print(f" - {spaces_with_toilets} spaces have nearby toilets ({toilet_coverage_pct}%)")
    print(f" - {total_trees} total trees analyzed")
    print(f" - {avg_species_per_space:.1f} average species per space")
    return output_file
async def main():
    """Script entry point: run the enhancement pipeline and save output."""
    processor = RealDataGreenSpaceProcessor()
    try:
        # Enhance every green space, then persist the enriched dataset.
        enhanced = await processor.process_all_green_spaces()
        output_file = processor.save_enhanced_data(enhanced)
        print(f"\n🎉 Successfully created real data enhanced Berlin green spaces!")
        print(f"📁 Output: {output_file}")
    except KeyboardInterrupt:
        print("\n⚠️ Processing interrupted by user")
    except Exception as e:
        print(f"❌ Error processing data: {e}")
        raise


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -0,0 +1,613 @@
#!/usr/bin/env python3
"""
Filtered OSM processor for significant Berlin green spaces.
Processes only meaningful green spaces (>1000 sqm) with real tree and toilet data.
"""
import json
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
import sys
import math
from typing import List, Dict, Optional, Tuple
# Add the app directory to Python path
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
class FilteredOSMProcessor:
    """Filters raw OSM ways down to significant Berlin green spaces and
    enriches them with real tree and toilet data via existing services."""

    def __init__(self, data_dir: str = "app/data"):
        """Wire up paths, data services, and the filtering thresholds.

        Args:
            data_dir: base directory containing the osm-raw/ and
                processed/ sub-directories.
        """
        self.data_dir = Path(data_dir)
        self.osm_raw_dir = self.data_dir / "osm-raw"
        self.processed_dir = self.data_dir / "processed"
        # Initialize services
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()
        # Berlin bounding box
        self.berlin_bbox = {
            'min_lat': 52.3370, 'max_lat': 52.6755,
            'min_lon': 13.0882, 'max_lon': 13.7611
        }
        # Filtering criteria
        self.min_area_sqm = 1000  # Minimum area to be considered significant
        self.max_spaces = 800  # Maximum number of spaces to process
def parse_and_filter_osm_data(self) -> List[Dict]:
    """Parse the raw OSM XML and keep only significant green spaces.

    A way qualifies when it maps to a green-space type and its polygon
    area is at least ``self.min_area_sqm``.  Parsing stops early once
    ``self.max_spaces`` candidates have been collected.

    Fix: the "area range" print indexed ``filtered_spaces[-1]`` and
    crashed with IndexError on an empty result, which the outer handler
    then misreported as a parse error and discarded the (empty) result
    path; the print is now guarded.

    Returns:
        Candidate space dicts sorted by area, largest first; empty list
        when the OSM file is missing or unparsable.
    """
    osm_file = self.osm_raw_dir / "berlin_green_spaces.osm"
    if not osm_file.exists():
        print(f"❌ OSM file not found: {osm_file}")
        print("Please run the download first or ensure the file exists.")
        return []
    print(f"📂 Parsing OSM data from {osm_file}")
    try:
        tree = ET.parse(osm_file)
        root = tree.getroot()
        ways = root.findall('.//way')
        print(f"📊 Found {len(ways)} total ways in OSM file")
        print(f"🔍 Filtering for significant green spaces (≥{self.min_area_sqm} sqm)...")
        filtered_spaces = []
        processed_count = 0
        for way in ways:
            processed_count += 1
            if processed_count % 5000 == 0:
                print(f" Processed {processed_count}/{len(ways)} ways... Found {len(filtered_spaces)} significant spaces")
            try:
                space_data = self._process_osm_way(way, root)
                if space_data and space_data['area_sqm'] >= self.min_area_sqm:
                    filtered_spaces.append(space_data)
                    # Stop if we have enough spaces
                    if len(filtered_spaces) >= self.max_spaces:
                        print(f"✅ Reached target of {self.max_spaces} significant spaces")
                        break
            except Exception:
                # Malformed ways are common in raw OSM; skip them.
                continue
        # Sort by area (largest first) to prioritize important spaces
        filtered_spaces.sort(key=lambda x: x['area_sqm'], reverse=True)
        print(f"🎯 Filtered to {len(filtered_spaces)} significant green spaces")
        # Guard: indexing [-1]/[0] would raise IndexError on an empty list.
        if filtered_spaces:
            print(f"📏 Area range: {filtered_spaces[-1]['area_sqm']:,} - {filtered_spaces[0]['area_sqm']:,} sqm")
        return filtered_spaces
    except Exception as e:
        print(f"❌ Error parsing OSM file: {e}")
        return []
def _process_osm_way(self, way, root) -> Optional[Dict]:
    """Process a single OSM way into green space format.

    Args:
        way: an ElementTree ``<way>`` element.
        root: the document root, used to resolve ``<nd>`` node references.

    Returns:
        A dict describing the green space, or None when the way does not
        qualify (wrong type, skipped type, fewer than three in-bounds
        nodes, or below the minimum area).
    """
    # Get tags
    tags = {}
    for tag in way.findall('tag'):
        tags[tag.get('k')] = tag.get('v')
    # Check if it's a significant green space
    green_space_type = self._get_green_space_type(tags)
    if not green_space_type:
        return None
    # Skip certain types that are usually small or not parks
    skip_types = ['grave_yard', 'cemetery', 'allotments']
    if green_space_type in skip_types:
        return None
    # Get node references
    nd_refs = [nd.get('ref') for nd in way.findall('nd')]
    if len(nd_refs) < 3:  # Need at least 3 points for an area
        return None
    # Find node coordinates
    coordinates = []
    for nd_ref in nd_refs:
        # NOTE(review): a full-tree XPath search per node reference is
        # O(nodes * refs); acceptable for one-off batch runs but slow on
        # very large extracts — consider a prebuilt id->node map.
        node = root.find(f".//node[@id='{nd_ref}']")
        if node is not None:
            lat = float(node.get('lat'))
            lon = float(node.get('lon'))
            # Check if within Berlin bounds
            if (self.berlin_bbox['min_lat'] <= lat <= self.berlin_bbox['max_lat'] and
                self.berlin_bbox['min_lon'] <= lon <= self.berlin_bbox['max_lon']):
                coordinates.append((lat, lon))
    if len(coordinates) < 3:
        return None
    # Calculate centroid and area
    centroid_lat, centroid_lon = self._calculate_centroid(coordinates)
    area_sqm = self._calculate_area(coordinates)
    # Skip if too small
    if area_sqm < self.min_area_sqm:
        return None
    # Get name; unnamed ways get a synthetic location-based label.
    name = tags.get('name')
    if not name:
        name = f"{green_space_type.title()} near {centroid_lat:.3f}, {centroid_lon:.3f}"
    # Estimate district
    district = self._estimate_district(centroid_lat, centroid_lon)
    return {
        'id': f"osm_way_{way.get('id')}",
        'name': name,
        'fclass': green_space_type,
        'lat': centroid_lat,
        'lng': centroid_lon,
        'area_sqm': int(area_sqm),
        'district': district,
        'osm_tags': tags,
        'osm_id': way.get('id'),
        'has_name': bool(tags.get('name'))  # Track if it has a real name
    }
def _get_green_space_type(self, tags: Dict) -> Optional[str]:
"""Determine if tags represent a significant green space."""
# Prioritize leisure tags (usually parks)
leisure = tags.get('leisure', '')
if leisure in ['park', 'garden', 'nature_reserve', 'recreation_ground', 'playground', 'common']:
return leisure
# Check landuse tags
landuse = tags.get('landuse', '')
if landuse in ['forest', 'grass', 'meadow', 'recreation_ground', 'village_green']:
return landuse
# Check natural tags (forests, etc.)
natural = tags.get('natural', '')
if natural in ['forest', 'wood', 'heath']:
return natural
return None
def _calculate_centroid(self, coordinates: List[Tuple[float, float]]) -> Tuple[float, float]:
"""Calculate centroid of polygon."""
lat_sum = sum(coord[0] for coord in coordinates)
lon_sum = sum(coord[1] for coord in coordinates)
count = len(coordinates)
return lat_sum / count, lon_sum / count
def _calculate_area(self, coordinates: List[Tuple[float, float]]) -> float:
"""Calculate area using shoelace formula (approximate for Berlin)."""
if len(coordinates) < 3:
return 0
# Convert to approximate meters for Berlin latitude
lat_to_m = 111000 # meters per degree latitude
lon_to_m = 111000 * math.cos(math.radians(52.5)) # adjust for Berlin
# Convert to meters
coords_m = [(lat * lat_to_m, lon * lon_to_m) for lat, lon in coordinates]
# Shoelace formula
area = 0
n = len(coords_m)
for i in range(n):
j = (i + 1) % n
area += coords_m[i][0] * coords_m[j][1]
area -= coords_m[j][0] * coords_m[i][1]
return abs(area) / 2
def _estimate_district(self, lat: float, lng: float) -> str:
"""Estimate Berlin district from coordinates."""
# Simplified district boundaries
if lat > 52.55:
return "Pankow" if lng < 13.45 else "Lichtenberg"
elif lat > 52.52:
if lng < 13.25:
return "Charlottenburg-Wilmersdorf"
elif lng < 13.42:
return "Mitte"
else:
return "Friedrichshain-Kreuzberg"
elif lat > 52.45:
if lng < 13.25:
return "Steglitz-Zehlendorf"
elif lng < 13.42:
return "Tempelhof-Schöneberg"
else:
return "Neukölln"
else:
return "Treptow-Köpenick"
async def enhance_green_space_with_real_data(self, space_data: Dict) -> Optional[Dict]:
    """Enhance a parsed OSM green space with real tree and toilet data.

    Args:
        space_data: record produced by _process_osm_way.

    Returns:
        The enriched record, or None when any lookup/scoring step raises
        (the error is printed, not propagated).
    """
    try:
        lat = space_data['lat']
        lng = space_data['lng']
        area_sqm = space_data['area_sqm']
        # Adaptive radius based on space size: sqrt(area) scaled, clamped
        # to 150-400 m so tiny and huge spaces stay reasonable.
        radius = min(400, max(150, int((area_sqm ** 0.5) * 0.8)))
        # Get real data using existing services
        tree_response = await self.tree_service.get_trees_near_location(
            lat, lng, radius_m=radius
        )
        nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 600)
        # Calculate scores
        toilet_score = self._score_toilet_accessibility(nearby_toilets)
        space_type = self._map_to_space_type(space_data.get('fclass', ''))
        enhanced_space = {
            "id": space_data['id'],
            "name": space_data['name'],
            "description": f"Significant Berlin {space_data.get('fclass', 'green space')} from OSM data",
            "type": space_type,
            "coordinates": {
                "lat": float(lat),
                "lng": float(lng)
            },
            "neighborhood": space_data.get('district', 'Unknown'),
            "area_sqm": area_sqm,
            "perimeter_m": int(4 * (area_sqm ** 0.5)),  # Rough estimate (square-shaped assumption)
            # Environmental features from real tree data
            "environmental": {
                "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                "shade_quality": tree_response.shade_analysis.shade_quality_score,
                "noise_level": self._estimate_noise_level(space_data),
                "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                "water_features": self._detect_water_features(space_data),
                "natural_surface_percent": self._estimate_natural_surface(space_data.get('fclass', ''))
            },
            # Real tree metrics from your existing service
            "tree_data": {
                "total_trees": tree_response.metrics.total_trees,
                "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                "species_count": len(tree_response.metrics.dominant_species),
                "species_diversity_score": tree_response.metrics.species_diversity_score,
                "mature_trees_count": tree_response.metrics.mature_trees_count,
                "young_trees_count": tree_response.metrics.young_trees_count,
                "average_tree_age": tree_response.metrics.average_tree_age,
                "average_height": tree_response.metrics.average_height,
                "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                "dominant_species": tree_response.metrics.dominant_species[:3]  # Top 3
            },
            # Real toilet accessibility from your existing service
            "toilet_accessibility": {
                "nearby_toilets_count": len(nearby_toilets),
                "accessibility_score": toilet_score,
                "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
            },
            # Accessibility features (heuristics, not measured data)
            "accessibility": {
                "wheelchair_accessible": True,
                "public_transport_score": self._estimate_transport_score(space_data.get('district', '')),
                "cycling_infrastructure": area_sqm > 5000,
                "parking_availability": 3 if area_sqm > 50000 else 2,
                "lighting_quality": 3 if 'mitte' in space_data.get('district', '').lower() else 2
            },
            "recreation": {
                "playground_quality": self._estimate_playground_quality(space_data),
                "sports_facilities": self._estimate_sports_facilities(space_data),
                "running_paths": area_sqm > 8000,
                "cycling_paths": area_sqm > 15000,
                "dog_friendly": True,
                "bbq_allowed": self._allows_bbq(space_data)
            },
            # OSM metadata
            "osm_metadata": {
                "osm_id": space_data.get('osm_id'),
                "has_official_name": space_data.get('has_name', False),
                "tags": space_data.get('osm_tags', {}),
                "source": "filtered_osm_extract"
            },
            "last_updated": datetime.now().isoformat(),
            "data_sources": ["filtered_osm_extract", "berlin_tree_cadastre", "berlin_toilets"],
            # Officially named spaces are considered more trustworthy.
            "confidence_score": 95 if space_data.get('has_name') else 85
        }
        return enhanced_space
    except Exception as e:
        print(f"❌ Error enhancing {space_data['name']}: {e}")
        return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
if not nearby_toilets:
return 25
nearest = nearby_toilets[0]['distance_meters']
if nearest <= 200:
score = 95
elif nearest <= 400:
score = 80
elif nearest <= 600:
score = 65
else:
score = 45
# Quality bonuses
free = len([t for t in nearby_toilets if t.get('is_free', False)])
accessible = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
score += min(10, free * 5 + accessible * 3)
return min(100, score)
def _map_to_space_type(self, fclass: str) -> str:
mapping = {
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN', 'wood': 'FOREST',
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
'common': 'PARK', 'village_green': 'GRASS', 'heath': 'HEATH'
}
return mapping.get(fclass, 'PARK')
def _detect_water_features(self, space_data: Dict) -> bool:
name = space_data.get('name', '').lower()
tags = space_data.get('osm_tags', {})
water_keywords = ['see', 'teich', 'pond', 'lake', 'bach', 'spree', 'wasser', 'fluss']
return (any(keyword in name for keyword in water_keywords) or
'water' in str(tags.values()).lower())
def _estimate_noise_level(self, space_data: Dict) -> int:
fclass = space_data.get('fclass', '')
district = space_data.get('district', '')
area = space_data.get('area_sqm', 0)
base = {'forest': 1, 'wood': 1, 'nature_reserve': 1, 'heath': 1,
'meadow': 2, 'grass': 2, 'park': 2, 'garden': 2,
'playground': 3, 'recreation_ground': 3}.get(fclass, 2)
# Central districts are noisier
if any(busy in district.lower() for busy in ['mitte', 'kreuzberg', 'friedrichshain']):
base += 1
# Larger spaces are usually quieter inside
if area > 50000:
base = max(1, base - 1)
return min(5, base)
def _estimate_natural_surface(self, fclass: str) -> int:
return {'forest': 95, 'wood': 95, 'nature_reserve': 95, 'heath': 90,
'meadow': 95, 'grass': 90, 'park': 80, 'garden': 70,
'playground': 45, 'recreation_ground': 75}.get(fclass, 75)
def _estimate_transport_score(self, district: str) -> int:
district_lower = district.lower()
if 'mitte' in district_lower:
return 5
elif any(name in district_lower for name in ['charlottenburg', 'kreuzberg', 'friedrichshain', 'pankow']):
return 4
else:
return 3
def _estimate_playground_quality(self, space_data: Dict) -> int:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
area = space_data.get('area_sqm', 0)
if fclass == 'playground':
return 85
elif 'playground' in str(tags.values()).lower():
return 75
elif fclass == 'park':
# Larger parks more likely to have good playgrounds
return 60 if area > 10000 else 45
else:
return 30
def _estimate_sports_facilities(self, space_data: Dict) -> bool:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
name = space_data.get('name', '').lower()
area = space_data.get('area_sqm', 0)
# Explicit indicators
if (fclass == 'recreation_ground' or
'sport' in str(tags.values()).lower() or
any(term in name for term in ['sport', 'football', 'tennis', 'recreation'])):
return True
# Large parks often have sports facilities
return fclass == 'park' and area > 20000
def _allows_bbq(self, space_data: Dict) -> bool:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
area = space_data.get('area_sqm', 0)
# Check explicit BBQ tags
bbq_tag = tags.get('bbq', '').lower()
if bbq_tag == 'yes':
return True
elif bbq_tag == 'no':
return False
# Default based on type and size
return fclass in ['park', 'recreation_ground'] and area > 5000
async def process_filtered_green_spaces(self):
    """Main processing pipeline for filtered green spaces.

    Parses and filters the OSM extract, then enriches each significant
    space with real tree and toilet data, printing per-space progress.

    Returns:
        list[dict]: enhanced green-space records (empty when parsing or
        filtering produced nothing).
    """
    print("🌳 Processing Significant Berlin Green Spaces")
    print("=" * 55)
    print(f"• Filtering for spaces ≥ {self.min_area_sqm:,} sqm")
    print(f"• Processing up to {self.max_spaces} significant spaces")
    print(f"• Enhancing with real Berlin tree + toilet data")
    print("=" * 55)
    # Step 1: Parse and filter OSM data
    filtered_spaces = self.parse_and_filter_osm_data()
    if not filtered_spaces:
        print("❌ No significant green spaces found")
        return []
    print(f"\n🔧 Enhancing {len(filtered_spaces)} significant spaces with real data...")
    # Step 2: Enhance with real data
    enhanced_spaces = []
    for i, space_data in enumerate(filtered_spaces, 1):
        area_ha = space_data['area_sqm'] / 10000
        print(f"[{i:3d}/{len(filtered_spaces)}] {space_data['name'][:40]:40} ({area_ha:.1f} ha)")
        result = await self.enhance_green_space_with_real_data(space_data)
        if result:
            enhanced_spaces.append(result)
            trees = result["tree_data"]["total_trees"]
            toilets = result["toilet_accessibility"]["nearby_toilets_count"]
            print(f"{trees:3d} trees, {toilets} toilets")
        else:
            # Failures are reported but do not abort the batch.
            print(f" ❌ Enhancement failed")
        # Progress update every 50 spaces
        if i % 50 == 0:
            print(f"\n 📊 Progress: {len(enhanced_spaces)}/{i} enhanced successfully")
        # Small delay to be nice to services
        await asyncio.sleep(0.1)
    print(f"\n🎉 Successfully enhanced {len(enhanced_spaces)} significant green spaces!")
    return enhanced_spaces
def save_enhanced_data(self, enhanced_spaces: List[Dict]):
    """Persist the filtered, enhanced dataset plus summary statistics.

    Fix: both the ``coverage_rates`` payload fields and the summary
    prints divided by ``len(enhanced_spaces)`` unguarded and raised
    ZeroDivisionError on an empty list; percentages are now computed
    once with an empty-input guard and reused.

    Args:
        enhanced_spaces: enhanced records; may be empty.

    Returns:
        Path to the written JSON file.
    """
    output_file = self.processed_dir / "significant_berlin_green_spaces.json"
    # Calculate comprehensive statistics
    total = len(enhanced_spaces)
    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)
    total_area = sum(s["area_sqm"] for s in enhanced_spaces)
    # Guard all percentage maths against an empty dataset.
    tree_pct = round((with_trees / total) * 100, 1) if total else 0
    toilet_pct = round((with_toilets / total) * 100, 1) if total else 0
    tree_pct_int = round((with_trees / total) * 100) if total else 0
    toilet_pct_int = round((with_toilets / total) * 100) if total else 0
    # Named vs unnamed spaces
    named_spaces = len([s for s in enhanced_spaces if s["osm_metadata"]["has_official_name"]])
    # Area distribution
    large_spaces = len([s for s in enhanced_spaces if s["area_sqm"] > 50000])  # > 5 hectares
    medium_spaces = len([s for s in enhanced_spaces if 10000 <= s["area_sqm"] <= 50000])  # 1-5 hectares
    small_spaces = len([s for s in enhanced_spaces if s["area_sqm"] < 10000])  # < 1 hectare
    # District breakdown
    by_district = {}
    for space in enhanced_spaces:
        by_district.setdefault(space['neighborhood'], []).append(space)
    data = {
        "green_spaces": enhanced_spaces,
        "total_count": total,
        "last_updated": datetime.now().isoformat(),
        "data_sources": [
            "filtered_osm_extract_significant_spaces_only",
            "berlin_tree_cadastre_via_street_tree_service",
            "berlin_toilet_locations_via_berlin_data_service"
        ],
        "processing_info": {
            "filtering_criteria": {
                "minimum_area_sqm": self.min_area_sqm,
                "maximum_spaces_processed": self.max_spaces,
                "includes_only_significant_spaces": True
            },
            "enhancement_method": "real_berlin_tree_and_toilet_data",
            "coordinate_system": "WGS84"
        },
        "summary_stats": {
            "total_spaces": total,
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            "total_area_hectares": round(total_area / 10000, 1),
            "coverage_rates": {
                "tree_data": f"{tree_pct}%",
                "toilet_data": f"{toilet_pct}%"
            },
            "space_categories": {
                "named_spaces": named_spaces,
                "unnamed_spaces": total - named_spaces,
                "large_spaces_over_5ha": large_spaces,
                "medium_spaces_1_5ha": medium_spaces,
                "smaller_spaces_under_1ha": small_spaces
            }
        },
        "district_breakdown": {
            district: len(spaces) for district, spaces in by_district.items()
        }
    }
    # ensure_ascii=False keeps German place names readable in the file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\n📁 Comprehensive dataset saved: {output_file}")
    print(f"\n📊 Final Statistics:")
    print(f" 🌳 {len(enhanced_spaces)} significant green spaces")
    print(f" 📛 {named_spaces} with official names, {len(enhanced_spaces) - named_spaces} discovered areas")
    print(f" 🌲 {with_trees} spaces with tree data ({tree_pct_int}%)")
    print(f" 🚻 {with_toilets} spaces with toilet data ({toilet_pct_int}%)")
    print(f" 🌿 {total_trees:,} total trees analyzed")
    print(f" 📏 {round(total_area/10000, 1)} hectares total area")
    print(f"\n🏙️ District Distribution:")
    for district, spaces in sorted(by_district.items(), key=lambda x: len(x[1]), reverse=True):
        print(f"{district}: {len(spaces)} spaces")
    print(f"\n📈 Size Categories:")
    print(f" • Large (>5 ha): {large_spaces} spaces")
    print(f" • Medium (1-5 ha): {medium_spaces} spaces")
    print(f" • Smaller (<1 ha): {small_spaces} spaces")
    print(f"\n✨ This dataset provides comprehensive coverage of Berlin's")
    print(f" significant green spaces with real tree and toilet data!")
    return output_file
async def main():
    """Entry point: filter, enhance, and persist significant green spaces."""
    processor = FilteredOSMProcessor()
    try:
        # Run the full filtering + enhancement pipeline.
        enhanced = await processor.process_filtered_green_spaces()
        if enhanced:
            processor.save_enhanced_data(enhanced)
            print(f"\n🎯 SUCCESS! Ready to use in your API for accurate personality scoring!")
        else:
            print("❌ No spaces were successfully processed.")
    except KeyboardInterrupt:
        print("\n⚠️ Process interrupted by user")
    except Exception as e:
        print(f"❌ Error: {e}")
        raise


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -0,0 +1,613 @@
#!/usr/bin/env python3
"""
Process Berlin green spaces from local OSM data file.
Downloads Berlin OSM extract once, then processes locally without API dependencies.
"""
import json
import requests
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Tuple
import sys
import gzip
import math
# Add the app directory to Python path to import services
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
class LocalOSMProcessor:
    """Processes Berlin green spaces from a locally downloaded OSM extract,
    avoiding per-request API dependencies after the one-time download."""

    def __init__(self, data_dir: str = "app/data"):
        """Create working directories and instantiate the data services.

        Args:
            data_dir: base directory holding the osm-raw/ and processed/
                sub-directories (created if missing).
        """
        self.data_dir = Path(data_dir)
        self.raw_dir = self.data_dir / "osm-raw"
        self.processed_dir = self.data_dir / "processed"
        # Create directories
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)
        # Initialize existing services
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()
        # Berlin bounding box for filtering
        self.berlin_bbox = {
            'min_lat': 52.3370, 'max_lat': 52.6755,
            'min_lon': 13.0882, 'max_lon': 13.7611
        }
def download_berlin_osm_extract(self):
    """Download the Berlin OSM extract from Geofabrik (one-time, cached).

    Tries the PBF format first (smaller), then the bz2 XML dump.

    Returns:
        Path to the downloaded (or previously cached) file.

    Raises:
        Exception: when no mirror could be downloaded.
    """
    osm_file = self.raw_dir / "berlin-latest.osm.pbf"
    # NOTE(review): the cache check only looks for the .pbf name; a
    # previously downloaded .bz2 fallback would be re-downloaded — confirm
    # this is intended.
    if osm_file.exists():
        print(f"✅ OSM file already exists: {osm_file}")
        return osm_file
    # Try PBF format first (smaller), fallback to XML
    urls = [
        "https://download.geofabrik.de/europe/germany/berlin-latest.osm.pbf",
        "https://download.geofabrik.de/europe/germany/berlin-latest.osm.bz2"
    ]
    for url in urls:
        try:
            print(f"Downloading Berlin OSM data from {url}")
            print("This is a one-time download (~50MB)...")
            response = requests.get(url, stream=True, timeout=300)
            response.raise_for_status()
            filename = url.split('/')[-1]
            local_file = self.raw_dir / filename
            # Download with progress
            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0
            with open(local_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            print(f"\rDownload progress: {percent:.1f}%", end="")
            print(f"\n✅ Downloaded: {local_file}")
            return local_file
        except Exception as e:
            # Try the next mirror/format before giving up.
            print(f"❌ Failed to download {url}: {e}")
            continue
    raise Exception("Could not download OSM data from any source")
def download_simple_osm_extract(self):
    """Download Berlin green spaces as plain OSM XML via the Overpass API.

    Fallback path for when PBF tooling is unavailable; the export is a
    one-time download cached on disk.

    Returns:
        Path to the downloaded (or previously cached) XML file.

    Raises:
        Exception: re-raises whatever the HTTP request raised on failure.
    """
    osm_file = self.raw_dir / "berlin_green_spaces.osm"
    if osm_file.exists():
        print(f"✅ OSM file already exists: {osm_file}")
        return osm_file
    # Use Overpass API to get a one-time export of green spaces
    print("Downloading Berlin green spaces extract...")
    overpass_url = "http://overpass-api.de/api/interpreter"
    # Query for all green spaces in Berlin (one-time download); the three
    # clauses cover leisure, landuse and natural taggings within the bbox.
    query = f"""
    [out:xml][timeout:120];
    (
      way["leisure"~"^(park|garden|nature_reserve|recreation_ground|playground|common)$"]
        ({self.berlin_bbox['min_lat']},{self.berlin_bbox['min_lon']},{self.berlin_bbox['max_lat']},{self.berlin_bbox['max_lon']});
      way["landuse"~"^(forest|grass|meadow|recreation_ground|village_green|allotments)$"]
        ({self.berlin_bbox['min_lat']},{self.berlin_bbox['min_lon']},{self.berlin_bbox['max_lat']},{self.berlin_bbox['max_lon']});
      way["natural"~"^(forest|grass|meadow|scrub|heath|wood)$"]
        ({self.berlin_bbox['min_lat']},{self.berlin_bbox['min_lon']},{self.berlin_bbox['max_lat']},{self.berlin_bbox['max_lon']});
    );
    out geom meta;
    """
    try:
        response = requests.post(overpass_url, data=query, timeout=180)
        response.raise_for_status()
        with open(osm_file, 'w', encoding='utf-8') as f:
            f.write(response.text)
        print(f"✅ Downloaded green spaces extract: {osm_file}")
        return osm_file
    except Exception as e:
        print(f"❌ Failed to download OSM extract: {e}")
        raise
def parse_osm_xml(self, osm_file: Path) -> List[Dict]:
    """Parse an OSM XML file and extract green spaces.

    Args:
        osm_file: path to the XML export (optionally gzip-compressed).

    Returns:
        Green-space dicts as built by _process_osm_way; empty on error.
    """
    print(f"Parsing OSM data from {osm_file}...")
    green_spaces = []
    try:
        # Handle different file formats
        # NOTE(review): only .gz is special-cased, but the Geofabrik
        # fallback downloads .pbf/.bz2, which ET.parse cannot read —
        # confirm this method is only fed the Overpass XML export.
        if osm_file.suffix == '.gz':
            with gzip.open(osm_file, 'rt', encoding='utf-8') as f:
                tree = ET.parse(f)
        else:
            tree = ET.parse(osm_file)
        root = tree.getroot()
        # Parse ways (areas)
        ways = root.findall('.//way')
        print(f"Found {len(ways)} ways in OSM data")
        for way in ways:
            try:
                processed_space = self._process_osm_way(way, root)
                if processed_space:
                    green_spaces.append(processed_space)
            except Exception as e:
                # Malformed ways are skipped; raw OSM data is noisy.
                continue
        print(f"✅ Extracted {len(green_spaces)} green spaces from OSM data")
        return green_spaces
    except Exception as e:
        print(f"❌ Error parsing OSM file: {e}")
        return []
def _process_osm_way(self, way, root) -> Optional[Dict]:
    """Process a single OSM way into green space format.

    Args:
        way: an ElementTree ``<way>`` element.
        root: the document root, used to resolve ``<nd>`` node references.

    Returns:
        A green-space dict, or None when the way is not a green space,
        has fewer than three in-bounds nodes, or covers less than 500 sqm.
    """
    # Get tags
    tags = {}
    for tag in way.findall('tag'):
        tags[tag.get('k')] = tag.get('v')
    # Check if it's a green space
    green_space_type = self._get_green_space_type(tags)
    if not green_space_type:
        return None
    # Get node references
    nd_refs = [nd.get('ref') for nd in way.findall('nd')]
    if len(nd_refs) < 3:  # Need at least 3 points for an area
        return None
    # Find node coordinates
    coordinates = []
    for nd_ref in nd_refs:
        # NOTE(review): per-ref XPath search over the whole tree is O(n*m);
        # fine for a one-off batch run, slow on very large extracts.
        node = root.find(f".//node[@id='{nd_ref}']")
        if node is not None:
            lat = float(node.get('lat'))
            lon = float(node.get('lon'))
            # Check if within Berlin bounds
            if (self.berlin_bbox['min_lat'] <= lat <= self.berlin_bbox['max_lat'] and
                self.berlin_bbox['min_lon'] <= lon <= self.berlin_bbox['max_lon']):
                coordinates.append((lat, lon))
    if len(coordinates) < 3:
        return None
    # Calculate centroid and area
    centroid_lat, centroid_lon = self._calculate_centroid(coordinates)
    area_sqm = self._calculate_area(coordinates)
    # Skip very small areas
    if area_sqm < 500:
        return None
    # Get name; unnamed ways get a synthetic location-based label.
    name = tags.get('name', f"{green_space_type.title()} near {centroid_lat:.3f}, {centroid_lon:.3f}")
    # Estimate district
    district = self._estimate_district(centroid_lat, centroid_lon)
    return {
        'id': f"osm_way_{way.get('id')}",
        'name': name,
        'fclass': green_space_type,
        'lat': centroid_lat,
        'lng': centroid_lon,
        'area_sqm': int(area_sqm),
        'district': district,
        'osm_tags': tags,
        'osm_id': way.get('id')
    }
def _get_green_space_type(self, tags: Dict) -> Optional[str]:
"""Determine if tags represent a green space and what type."""
# Check leisure tags
leisure = tags.get('leisure', '')
if leisure in ['park', 'garden', 'nature_reserve', 'recreation_ground',
'playground', 'common', 'golf_course']:
return leisure
# Check landuse tags
landuse = tags.get('landuse', '')
if landuse in ['forest', 'grass', 'meadow', 'recreation_ground',
'village_green', 'allotments']:
return landuse
# Check natural tags
natural = tags.get('natural', '')
if natural in ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood']:
return natural
return None
def _calculate_centroid(self, coordinates: List[Tuple[float, float]]) -> Tuple[float, float]:
"""Calculate centroid of polygon."""
lat_sum = sum(coord[0] for coord in coordinates)
lon_sum = sum(coord[1] for coord in coordinates)
count = len(coordinates)
return lat_sum / count, lon_sum / count
def _calculate_area(self, coordinates: List[Tuple[float, float]]) -> float:
"""Calculate area of polygon using shoelace formula."""
if len(coordinates) < 3:
return 0
# Convert to approximate meters for Berlin
lat_to_m = 111000 # meters per degree latitude
lon_to_m = 111000 * math.cos(math.radians(52.5)) # adjust for Berlin latitude
# Convert coordinates to meters
coords_m = [(lat * lat_to_m, lon * lon_to_m) for lat, lon in coordinates]
# Shoelace formula
area = 0
n = len(coords_m)
for i in range(n):
j = (i + 1) % n
area += coords_m[i][0] * coords_m[j][1]
area -= coords_m[j][0] * coords_m[i][1]
return abs(area) / 2
def _estimate_district(self, lat: float, lng: float) -> str:
"""Rough district estimation from coordinates."""
# Very rough Berlin district boundaries
if lat > 52.55:
return "Pankow" if lng < 13.45 else "Lichtenberg"
elif lat > 52.52:
if lng < 13.25:
return "Charlottenburg-Wilmersdorf"
elif lng < 13.42:
return "Mitte"
else:
return "Friedrichshain-Kreuzberg"
elif lat > 52.45:
if lng < 13.25:
return "Steglitz-Zehlendorf"
elif lng < 13.42:
return "Tempelhof-Schöneberg"
else:
return "Neukölln"
else:
return "Treptow-Köpenick"
async def enhance_green_space_with_real_data(self, space_data: Dict):
    """Enhance green space with real tree and toilet data.

    Args:
        space_data: Parsed OSM green-space dict; requires 'id', 'name',
            'lat', 'lng', 'area_sqm' and 'district'; other keys optional.

    Returns:
        The enhanced green-space dict, or None when enhancement fails
        (the error is printed, not re-raised).
    """
    try:
        lat = space_data['lat']
        lng = space_data['lng']
        area_sqm = space_data['area_sqm']
        print(f"Enhancing {space_data['name']} ({space_data['district']})...")
        # Adaptive radius: scales with sqrt(area), clamped to 100-350 m.
        radius = min(350, max(100, int((area_sqm ** 0.5) * 0.7)))
        # Get real data using existing services
        tree_response = await self.tree_service.get_trees_near_location(
            lat, lng, radius_m=radius
        )
        # Toilets within a fixed 600 m radius of the centroid.
        nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 600)
        # Calculate scores
        toilet_score = self._score_toilet_accessibility(nearby_toilets)
        space_type = self._map_to_space_type(space_data.get('fclass', ''))
        enhanced_space = {
            "id": space_data['id'],
            "name": space_data['name'],
            "description": f"Berlin {space_data.get('fclass', 'green space')} from local OSM data",
            "type": space_type,
            "coordinates": {
                "lat": float(lat),
                "lng": float(lng)
            },
            "neighborhood": space_data.get('district', 'Unknown'),
            "area_sqm": area_sqm,
            # Perimeter approximated as that of a square of equal area.
            "perimeter_m": int(4 * (area_sqm ** 0.5)),
            # Environmental features from real tree data
            "environmental": {
                # Floor of 5% so no space reports zero coverage.
                "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                "shade_quality": tree_response.shade_analysis.shade_quality_score,
                "noise_level": self._estimate_noise_level(space_data),
                "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                "water_features": self._detect_water_features(space_data),
                "natural_surface_percent": self._estimate_natural_surface(space_data.get('fclass', ''))
            },
            # Real tree metrics
            "tree_data": {
                "total_trees": tree_response.metrics.total_trees,
                "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                "species_count": len(tree_response.metrics.dominant_species),
                "species_diversity_score": tree_response.metrics.species_diversity_score,
                "mature_trees_count": tree_response.metrics.mature_trees_count,
                "young_trees_count": tree_response.metrics.young_trees_count,
                "average_tree_age": tree_response.metrics.average_tree_age,
                "average_height": tree_response.metrics.average_height,
                "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                "dominant_species": tree_response.metrics.dominant_species[:3]
            },
            # Real toilet accessibility
            "toilet_accessibility": {
                "nearby_toilets_count": len(nearby_toilets),
                "accessibility_score": toilet_score,
                # NOTE(review): assumes nearby_toilets is sorted
                # nearest-first — confirm in BerlinDataService.
                "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
            },
            # Standard features (heuristic defaults, not measured data)
            "accessibility": {
                "wheelchair_accessible": True,
                "public_transport_score": self._estimate_transport_score(space_data.get('district', '')),
                "cycling_infrastructure": area_sqm > 4000,
                "parking_availability": 2 if area_sqm > 20000 else 1,
                "lighting_quality": 3 if 'mitte' in space_data.get('district', '').lower() else 2
            },
            "recreation": {
                "playground_quality": self._estimate_playground_quality(space_data),
                "sports_facilities": self._estimate_sports_facilities(space_data),
                "running_paths": area_sqm > 6000,
                "cycling_paths": area_sqm > 12000,
                "dog_friendly": True,
                "bbq_allowed": self._allows_bbq(space_data)
            },
            # OSM metadata
            "osm_metadata": {
                "osm_id": space_data.get('osm_id'),
                "tags": space_data.get('osm_tags', {}),
                "source": "local_osm_extract"
            },
            "last_updated": datetime.now().isoformat(),
            "data_sources": ["local_osm_extract", "berlin_tree_cadastre", "berlin_toilets"],
            "confidence_score": 92
        }
        trees = tree_response.metrics.total_trees
        toilets = len(nearby_toilets)
        print(f"{space_data['name']}: {trees} trees, {toilets} toilets")
        return enhanced_space
    except Exception as e:
        # Best-effort: a single failed space is logged and skipped.
        print(f"❌ Error enhancing {space_data['name']}: {e}")
        return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
if not nearby_toilets:
return 25
nearest = nearby_toilets[0]['distance_meters']
if nearest <= 200:
score = 90
elif nearest <= 400:
score = 70
else:
score = 50
# Quality bonuses
free = len([t for t in nearby_toilets if t.get('is_free', False)])
accessible = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
score += min(10, free * 5 + accessible * 3)
return min(100, score)
def _map_to_space_type(self, fclass: str) -> str:
mapping = {
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN', 'wood': 'FOREST',
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
'common': 'PARK', 'village_green': 'GRASS', 'allotments': 'GARDEN'
}
return mapping.get(fclass, 'PARK')
def _detect_water_features(self, space_data: Dict) -> bool:
name = space_data.get('name', '').lower()
tags = space_data.get('osm_tags', {})
water_keywords = ['see', 'teich', 'pond', 'lake', 'bach', 'spree', 'wasser']
return any(keyword in name for keyword in water_keywords) or 'water' in tags.values()
def _estimate_noise_level(self, space_data: Dict) -> int:
fclass = space_data.get('fclass', '')
district = space_data.get('district', '')
base = {'forest': 1, 'wood': 1, 'nature_reserve': 1, 'meadow': 2,
'park': 2, 'garden': 2, 'playground': 3}.get(fclass, 2)
if any(busy in district.lower() for busy in ['mitte', 'kreuzberg', 'friedrichshain']):
base += 1
return min(5, base)
def _estimate_natural_surface(self, fclass: str) -> int:
return {'forest': 95, 'wood': 95, 'nature_reserve': 90, 'meadow': 95,
'grass': 85, 'park': 75, 'garden': 65, 'playground': 40}.get(fclass, 70)
def _estimate_transport_score(self, district: str) -> int:
district_lower = district.lower()
if 'mitte' in district_lower:
return 5
elif any(name in district_lower for name in ['charlottenburg', 'kreuzberg', 'friedrichshain']):
return 4
else:
return 3
def _estimate_playground_quality(self, space_data: Dict) -> int:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
if fclass == 'playground':
return 80
elif 'playground' in tags.values():
return 75
elif fclass == 'park':
return 55
else:
return 30
def _estimate_sports_facilities(self, space_data: Dict) -> bool:
fclass = space_data.get('fclass', '')
tags = space_data.get('osm_tags', {})
name = space_data.get('name', '').lower()
return (fclass == 'recreation_ground' or
'sport' in str(tags.values()).lower() or
any(term in name for term in ['sport', 'football', 'tennis']))
def _allows_bbq(self, space_data: Dict) -> bool:
fclass = space_data.get('fclass', '')
area = space_data.get('area_sqm', 0)
tags = space_data.get('osm_tags', {})
# Check explicit BBQ tags
if tags.get('bbq') == 'yes':
return True
elif tags.get('bbq') == 'no':
return False
# Default based on type and size
return fclass in ['park', 'recreation_ground'] and area > 5000
async def process_all_green_spaces(self):
    """Main processing pipeline.

    Downloads (or reuses) an OSM extract, parses green spaces out of it,
    then enhances each space with real tree and toilet data.

    Returns:
        List of enhanced green-space dicts (empty on failure).
    """
    print("🌳 Processing Berlin green spaces from local OSM data...")
    # Step 1: Get OSM data
    try:
        osm_file = self.download_simple_osm_extract()  # More reliable than PBF
    except Exception as e:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit and hid the failure reason. Catch Exception and log it.
        print(f"❌ Could not download OSM data: {e}")
        return []
    # Step 2: Parse green spaces
    green_spaces = self.parse_osm_xml(osm_file)
    if not green_spaces:
        print("❌ No green spaces found in OSM data")
        return []
    print(f"📊 Found {len(green_spaces)} green spaces to enhance")
    # Step 3: Enhance with real data, one space at a time.
    enhanced_spaces = []
    for i, space_data in enumerate(green_spaces, 1):
        print(f"[{i}/{len(green_spaces)}]", end=" ")
        result = await self.enhance_green_space_with_real_data(space_data)
        if result:
            enhanced_spaces.append(result)
        if i % 20 == 0:
            print(f"\n Progress: {len(enhanced_spaces)} enhanced so far...")
        # Brief pause to avoid hammering downstream services.
        await asyncio.sleep(0.1)
    print(f"\n✅ Enhanced {len(enhanced_spaces)} spaces with real data!")
    return enhanced_spaces
def save_enhanced_data(self, enhanced_spaces: List[Dict]):
    """Write the enhanced dataset (plus summary stats) to a JSON file.

    Args:
        enhanced_spaces: Enhanced green-space dicts from the pipeline.

    Returns:
        Path of the written JSON file.
    """
    output_file = self.processed_dir / "osm_berlin_green_spaces_enhanced.json"
    # Coverage statistics for the summary block.
    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)
    # Guard against an empty dataset — the percentage strings previously
    # divided by len(enhanced_spaces) unconditionally and raised
    # ZeroDivisionError (the quick-processor variant already guards this).
    total = len(enhanced_spaces)
    tree_coverage = f"{round((with_trees / total) * 100, 1)}%" if total else "0%"
    toilet_coverage = f"{round((with_toilets / total) * 100, 1)}%" if total else "0%"
    data = {
        "green_spaces": enhanced_spaces,
        "total_count": total,
        "last_updated": datetime.now().isoformat(),
        "data_sources": [
            "local_osm_extract_processed_offline",
            "berlin_tree_cadastre",
            "berlin_toilets"
        ],
        "processing_info": {
            "method": "local_osm_processing_no_api_dependency",
            "includes_all_osm_green_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": total,
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            "tree_coverage": tree_coverage,
            "toilet_coverage": toilet_coverage
        }
    }
    # Keep non-ASCII (German names, emoji) readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\n🎉 Saved comprehensive dataset: {output_file}")
    print(f"📊 {total} total green spaces")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print(f"\n✨ Ready to replace mock data in your API!")
    return output_file
async def main():
    """Entry point: run the full local-OSM processing pipeline."""
    processor = LocalOSMProcessor()
    try:
        banner = "=" * 50
        print("🚀 Berlin Green Spaces: Local OSM Processing")
        print(banner)
        print("• Downloads OSM data once (no API dependency)")
        print("• Processes locally for all green spaces")
        print("• Enhances with real Berlin tree + toilet data")
        print(banner)
        enhanced_spaces = await processor.process_all_green_spaces()
        if enhanced_spaces:
            processor.save_enhanced_data(enhanced_spaces)
    except KeyboardInterrupt:
        print("\n⚠️ Interrupted")
    except Exception as e:
        print(f"❌ Error: {e}")
if __name__ == "__main__":
    asyncio.run(main())

View File

@ -0,0 +1,558 @@
#!/usr/bin/env python3
"""
Quick Berlin green spaces processor.
Pre-filters OSM data efficiently, then processes only the best candidates.
"""
import json
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
import sys
import re
import math
# from tqdm.asyncio import tqdm # Not available, remove tqdm dependency
from xml.etree.ElementTree import iterparse
# Add the app directory to Python path
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
def calculate_polygon_area_sqm(coords):
    """Approximate polygon area in m² using the shoelace formula.

    Args:
        coords: Sequence of (lat, lng) pairs in degrees.

    Returns:
        int area in m², clamped to sane bounds:
        - fewer than 3 points, or area < 100 m² -> 5000 (default guess)
        - area > 10 km² -> 500000 (capped at a large-park size)
    """
    if len(coords) < 3:
        return 5000  # Default for invalid polygons
    # Planar approximation centred on the polygon — adequate for
    # city-scale shapes. Use math.radians instead of the previous
    # hand-rolled degree-to-radian helper.
    lat_center = sum(lat for lat, lng in coords) / len(coords)
    lng_center = sum(lng for lat, lng in coords) / len(coords)
    # Approximate meters per degree at the polygon's latitude.
    meters_per_lat = 111320  # roughly constant
    meters_per_lng = 111320 * math.cos(math.radians(lat_center))
    # Project coordinates to metres relative to the centre.
    meter_coords = [
        ((lng - lng_center) * meters_per_lng, (lat - lat_center) * meters_per_lat)
        for lat, lng in coords
    ]
    # Shoelace formula.
    area = 0
    n = len(meter_coords)
    for i in range(n):
        j = (i + 1) % n
        area += meter_coords[i][0] * meter_coords[j][1]
        area -= meter_coords[j][0] * meter_coords[i][1]
    area = abs(area) / 2
    # Reasonable bounds check
    if area < 100:  # Too small
        return 5000
    elif area > 10000000:  # Too large (10 km²)
        return 500000  # Cap at reasonable park size
    return int(area)
def calculate_search_radius(area_sqm):
    """Pick a tree-search radius (m) that scales with park area."""
    # (exclusive upper area bound in m², radius in m), smallest first.
    tiers = (
        (10000, 150),   # < 1 hectare
        (50000, 300),   # < 5 hectares
        (200000, 500),  # < 20 hectares
    )
    for max_area, radius in tiers:
        if area_sqm < max_area:
            return radius
    return 800  # Large parks like Treptower Park
def calculate_enhanced_shade_quality(tree_response, area_sqm):
    """Score shade quality 0-100 from real tree metrics.

    Combines five weighted factors: crown-based shade coverage, number of
    large mature trees nearby, tree density, average height, and average
    crown diameter. The total is capped at 100.
    (area_sqm is currently unused but kept for interface stability.)
    """
    def points_for(value, tiers):
        # tiers: (threshold, points) pairs ordered high-to-low; the first
        # threshold the value reaches determines the points awarded.
        for threshold, points in tiers:
            if value >= threshold:
                return points
        return 0

    metrics = tree_response.metrics
    shade_analysis = tree_response.shade_analysis
    score = 0
    # Factor 1: actual shade coverage (crown area based).
    score += points_for(metrics.shade_coverage_percent or 0,
                        ((60, 40), (40, 30), (20, 20), (10, 10)))
    # Factor 2: large mature trees provide the best shade.
    score += points_for(len(shade_analysis.nearby_large_trees or []),
                        ((10, 25), (5, 20), (3, 15), (1, 10)))
    # Factor 3: tree density per hectare.
    score += points_for(metrics.trees_per_hectare or 0,
                        ((50, 20), (30, 15), (20, 10), (10, 5)))
    # Factor 4: taller trees cast better shade.
    score += points_for(metrics.average_height or 0,
                        ((20, 10), (15, 8), (10, 5), (5, 3)))
    # Factor 5: crown diameter quality.
    score += points_for(metrics.average_crown_diameter or 0,
                        ((12, 5), (8, 3), (5, 1)))
    return min(100, score)
def detect_water_features(candidate):
    """Detect water features from OSM tags and name heuristics."""
    tags = candidate.get('tags', {})
    name = candidate.get('name', '').lower()
    # Tag check: water-ish values under the water/waterway/natural keys.
    watery_values = ['water', 'lake', 'pond', 'reservoir', 'river', 'stream']
    for key in ('water', 'waterway', 'natural'):
        if tags.get(key, '').lower() in watery_values:
            return True
    # Name check: German/English water words plus fountain indicators.
    water_words = ('see', 'teich', 'weiher', 'water', 'lake', 'pond',
                   'fluss', 'river', 'bach', 'creek',
                   'brunnen', 'fountain', 'springbrunnen')
    return any(word in name for word in water_words)
def estimate_berlin_district(lat: float, lng: float) -> str:
    """Estimate the Berlin district containing (lat, lng).

    Uses coarse rectangular latitude bands split by longitude — a rough
    heuristic, not authoritative district polygons.
    """
    # Each band: (exclusive min_lat, ((exclusive max_lng, district), ...)),
    # checked north to south; within a band the first fitting bound wins.
    bands = (
        (52.55, ((13.25, "Reinickendorf"), (13.45, "Pankow"),
                 (float("inf"), "Lichtenberg"))),
        (52.52, ((13.20, "Spandau"), (13.30, "Charlottenburg-Wilmersdorf"),
                 (13.42, "Mitte"), (13.48, "Friedrichshain-Kreuzberg"),
                 (float("inf"), "Lichtenberg"))),
        (52.48, ((13.20, "Spandau"), (13.30, "Charlottenburg-Wilmersdorf"),
                 (13.35, "Tempelhof-Schöneberg"), (13.42, "Mitte"),
                 (13.48, "Friedrichshain-Kreuzberg"),
                 (float("inf"), "Lichtenberg"))),
        (52.45, ((13.20, "Steglitz-Zehlendorf"),
                 (13.35, "Tempelhof-Schöneberg"), (13.45, "Neukölln"),
                 (13.55, "Treptow-Köpenick"),
                 (float("inf"), "Marzahn-Hellersdorf"))),
        (float("-inf"), ((13.35, "Steglitz-Zehlendorf"),
                         (float("inf"), "Treptow-Köpenick"))),
    )
    for min_lat, lng_bounds in bands:
        if lat > min_lat:
            for max_lng, district in lng_bounds:
                if lng < max_lng:
                    return district
    # Unreachable: the final band accepts any latitude.
    return "Treptow-Köpenick"
def get_specific_neighborhood(district: str, lat: float, lng: float) -> str:
    """Refine a district to a named neighborhood when (lat, lng) falls in a
    known bounding box; otherwise return the district unchanged."""
    # (min_lat, max_lat, min_lng, max_lng, neighborhood) per district,
    # checked in listed order — the first matching box wins.
    boxes_by_district = {
        "Mitte": (
            (52.540, 52.560, 13.33, 13.38, "Wedding"),
            (52.515, 52.530, 13.33, 13.38, "Moabit"),
            (52.510, 52.520, 13.35, 13.38, "Tiergarten"),
            (52.525, 52.545, 13.40, 13.43, "Prenzlauer Berg"),
        ),
        "Charlottenburg-Wilmersdorf": (
            (52.485, 52.505, 13.30, 13.33, "Wilmersdorf"),
            (52.505, 52.525, 13.25, 13.33, "Charlottenburg"),
        ),
        "Friedrichshain-Kreuzberg": (
            (52.490, 52.510, 13.38, 13.42, "Kreuzberg"),
            (52.510, 52.525, 13.42, 13.48, "Friedrichshain"),
        ),
        "Tempelhof-Schöneberg": (
            (52.480, 52.500, 13.33, 13.37, "Schöneberg"),
            (52.460, 52.480, 13.37, 13.42, "Tempelhof"),
        ),
        "Steglitz-Zehlendorf": (
            (52.430, 52.450, 13.23, 13.30, "Zehlendorf"),
            (52.450, 52.470, 13.30, 13.35, "Steglitz"),
        ),
        "Treptow-Köpenick": (
            (52.430, 52.460, 13.55, 13.65, "Köpenick"),
            (52.480, 52.500, 13.45, 13.50, "Treptow"),
        ),
    }
    for min_lat, max_lat, min_lng, max_lng, neighborhood in boxes_by_district.get(district, ()):
        if min_lat <= lat <= max_lat and min_lng <= lng <= max_lng:
            return neighborhood
    return district
async def quick_process():
    """Quick processing of significant Berlin green spaces.

    Single-pass streaming scan (iterparse) of a local OSM XML extract to
    collect up to 100 promising green-space candidates, then concurrent
    enhancement of each candidate with real tree and toilet data, and a
    final JSON dump with summary statistics.
    """
    print("🚀 Quick Berlin Green Spaces Processor")
    print("=" * 45)
    # Initialize services
    tree_service = StreetTreeService()
    berlin_data = BerlinDataService()
    # Pre-load and index trees once to avoid repeated indexing
    print("🔄 Pre-loading tree data and building spatial index...")
    await tree_service._load_trees()
    osm_file = Path("app/data/osm-raw/berlin_green_spaces.osm")
    if not osm_file.exists():
        print("❌ OSM file not found. Please ensure data is downloaded.")
        return
    print("🔍 Quick filtering for named parks and significant areas...")
    print(f"📁 OSM file size: {osm_file.stat().st_size / (1024*1024):.1f} MB")
    # Quick scan for good candidates
    candidates = []
    try:
        processed = 0
        print("🔍 Single-pass XML parsing - ways with embedded coordinates...")
        # Single pass: parse ways with embedded coordinates.
        # State machine: accumulate tags/coords while inside a <way>,
        # then evaluate the way when its end event arrives.
        ways_processed = 0
        current_way_tags = {}
        current_way_coordinates = []
        in_way = False
        for event, elem in iterparse(osm_file, events=('start', 'end')):
            if event == 'start':
                if elem.tag == 'way':
                    in_way = True
                    current_way_tags = {}
                    current_way_coordinates = []
                    ways_processed += 1
                    if ways_processed % 1000 == 0:
                        print(f"Processed {ways_processed} ways, found {len(candidates)} candidates so far...")
                elif in_way and elem.tag == 'tag':
                    k = elem.get('k')
                    v = elem.get('v')
                    if k and v:
                        current_way_tags[k] = v
                elif in_way and elem.tag == 'nd':
                    # Extract coordinates directly from nd element
                    # NOTE(review): standard OSM XML <nd> elements carry
                    # only a 'ref' attribute; this relies on a
                    # preprocessed extract with embedded lat/lon —
                    # confirm the extract format.
                    lat = elem.get('lat')
                    lon = elem.get('lon')
                    if lat and lon:
                        current_way_coordinates.append((float(lat), float(lon)))
                continue
            # From here on: 'end' events only.
            if elem.tag == 'way' and in_way:
                in_way = False
                tags = current_way_tags
                coordinates = current_way_coordinates
                # Quick filters for promising spaces - be more lenient
                has_name = 'name' in tags
                is_park = (tags.get('leisure') in ['park', 'garden', 'nature_reserve'] or
                           tags.get('landuse') in ['forest', 'grass', 'recreation_ground'])
                # Also accept common green space tags
                has_green_tags = any(key in tags for key in ['leisure', 'landuse', 'natural', 'amenity'])
                if not (has_name or is_park or has_green_tags):
                    elem.clear()  # Free memory
                    continue
                # Use embedded coordinates directly
                if not coordinates:
                    elem.clear()  # Free memory
                    continue
                # Get center coordinate and all coordinates for area calculation
                lat, lng = coordinates[0] if len(coordinates) == 1 else (
                    sum(lat for lat, lng in coordinates) / len(coordinates),
                    sum(lng for lat, lng in coordinates) / len(coordinates)
                )
                # Basic Berlin bounds check
                if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
                    elem.clear()  # Free memory
                    continue
                name = tags.get('name', f"Unnamed {tags.get('leisure', tags.get('landuse', 'area'))}")
                space_type = tags.get('leisure') or tags.get('landuse') or 'park'
                candidate = {
                    'id': f"quick_{elem.get('id')}",
                    'name': name,
                    'type': space_type,
                    'lat': lat,
                    'lng': lng,
                    'has_name': has_name,
                    'tags': tags,
                    'coordinates': coordinates  # Store all coordinates for area calculation
                }
                candidates.append(candidate)
                processed += 1
                # Limit for quick processing
                if len(candidates) >= 100:
                    elem.clear()  # Free memory
                    break
                elem.clear()  # Free memory
            else:
                elem.clear()  # Free memory
        print(f"✅ Found {len(candidates)} promising green spaces")
    except Exception as e:
        print(f"❌ Error in quick filtering: {e}")
        return
    if not candidates:
        print("No candidates found")
        return
    # Sort by having names (better quality)
    candidates.sort(key=lambda x: x['has_name'], reverse=True)
    print(f"\n🔧 Enhancing top {len(candidates)} spaces with real data...")
    # Process candidates in parallel with batching
    batch_size = 10  # Process 10 candidates at a time
    enhanced_spaces = []
    async def process_candidate(candidate):
        """Process a single candidate with tree and toilet data."""
        try:
            # Calculate actual area from OSM polygon coordinates
            area_sqm = calculate_polygon_area_sqm(candidate.get('coordinates', []))
            search_radius = calculate_search_radius(area_sqm)
            # Get real tree data and toilet data concurrently with dynamic radius
            tree_task = tree_service.get_trees_near_location(
                candidate['lat'], candidate['lng'], radius_m=search_radius
            )
            toilet_task = berlin_data.get_toilets_near_point(
                candidate['lat'], candidate['lng'], 500
            )
            print(f"🔍 Getting data for {candidate['name'][:30]}... (area: {area_sqm/10000:.1f}ha, radius: {search_radius}m)")
            tree_response, nearby_toilets = await asyncio.gather(tree_task, toilet_task)
            # Create enhanced space
            enhanced_space = {
                "id": candidate['id'],
                "name": candidate['name'],
                "description": f"Berlin {candidate['type']} discovered via quick OSM processing",
                "type": "PARK",  # Simplified for now
                "coordinates": {
                    "lat": candidate['lat'],
                    "lng": candidate['lng']
                },
                "neighborhood": get_specific_neighborhood(estimate_berlin_district(candidate['lat'], candidate['lng']), candidate['lat'], candidate['lng']),
                "area_sqm": area_sqm,  # Real calculated area
                # Environmental features from real tree data
                "environmental": {
                    "tree_coverage_percent": max(5, int(tree_response.metrics.shade_coverage_percent)),  # Use actual crown area calculation
                    "shade_quality": calculate_enhanced_shade_quality(tree_response, area_sqm),
                    "noise_level": 2,  # Default
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    "water_features": detect_water_features(candidate),
                    "natural_surface_percent": 80
                },
                # Real tree data
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species[:3]
                },
                # Real toilet data
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": 80 if nearby_toilets else 30,
                    # NOTE(review): assumes nearby_toilets is sorted
                    # nearest-first — confirm in BerlinDataService.
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },
                # Standard features (fixed heuristic defaults)
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": 3,
                    "cycling_infrastructure": True,
                    "parking_availability": 2,
                    "lighting_quality": 3
                },
                "recreation": {
                    "playground_quality": 60 if candidate['type'] == 'park' else 30,
                    "sports_facilities": candidate['type'] == 'recreation_ground',
                    "running_paths": True,
                    "cycling_paths": True,
                    "dog_friendly": True,
                    "bbq_allowed": candidate['type'] in ['park', 'recreation_ground']
                },
                "osm_metadata": {
                    "has_official_name": candidate['has_name'],
                    "tags": candidate['tags'],
                    "source": "quick_osm_processing"
                },
                "last_updated": datetime.now().isoformat(),
                "data_sources": ["quick_osm_scan", "berlin_tree_cadastre", "berlin_toilets"],
                "confidence_score": 90 if candidate['has_name'] else 75
            }
            return enhanced_space, tree_response.metrics.total_trees, len(nearby_toilets)
        except Exception as e:
            # Best-effort: a single failed candidate is logged and skipped.
            print(f"❌ Error processing {candidate['name']}: {e}")
            return None, 0, 0
    # Process candidates in batches with progress bar
    for i in range(0, len(candidates), batch_size):
        batch = candidates[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(candidates) + batch_size - 1)//batch_size}")
        # Process batch concurrently with progress bar
        tasks = [process_candidate(candidate) for candidate in batch]
        results = await asyncio.gather(*tasks)
        # Collect results
        for result, trees, toilets in results:
            if result:
                enhanced_spaces.append(result)
                print(f"{result['name'][:40]:40} - {trees:3d} trees, {toilets} toilets")
        # Small delay between batches to be respectful to APIs
        if i + batch_size < len(candidates):
            await asyncio.sleep(0.5)
    # Save results
    output_file = Path("app/data/processed/quick_berlin_green_spaces.json")
    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)
    data = {
        "green_spaces": enhanced_spaces,
        "total_count": len(enhanced_spaces),
        "last_updated": datetime.now().isoformat(),
        "data_sources": ["quick_osm_processing", "berlin_tree_cadastre", "berlin_toilets"],
        "processing_info": {
            "method": "quick_scan_for_named_and_significant_spaces",
            "prioritizes_named_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": len(enhanced_spaces),
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            # Guarded against an empty result set.
            "tree_coverage": f"{round((with_trees/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%",
            "toilet_coverage": f"{round((with_toilets/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%"
        }
    }
    # Keep non-ASCII (German names, emoji) readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\n🎉 Quick processing complete!")
    print(f"📁 Saved: {output_file}")
    print(f"📊 {len(enhanced_spaces)} spaces enhanced")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print(f"\n✨ Ready to use! This gives you real Berlin green spaces")
    print(f"   with actual tree and toilet data for personality scoring!")
if __name__ == "__main__":
    asyncio.run(quick_process())

View File

@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Test OSM processing with a small sample to verify it works.
"""
import json
import asyncio
import xml.etree.ElementTree as ET
from pathlib import Path
from datetime import datetime
import sys
import math
# Add the app directory to Python path
sys.path.append(str(Path(__file__).parent.parent))
from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService
async def test_processing():
"""Test the processing with a small sample."""
print("🧪 Testing OSM processing with sample data...")
# Initialize services
tree_service = StreetTreeService()
berlin_data = BerlinDataService()
# Parse OSM file and get first 5 green spaces as test
osm_file = Path("app/data/osm-raw/berlin_green_spaces.osm")
if not osm_file.exists():
print("❌ OSM file not found")
return
tree = ET.parse(osm_file)
root = tree.getroot()
ways = root.findall('.//way')
print(f"📊 Found {len(ways)} total ways in OSM file")
# Process first 5 green spaces as test
sample_spaces = []
processed_count = 0
for way in ways:
if processed_count >= 5:
break
# Get tags
tags = {}
for tag in way.findall('tag'):
tags[tag.get('k')] = tag.get('v')
# Check if it's a green space
green_space_type = None
leisure = tags.get('leisure', '')
landuse = tags.get('landuse', '')
natural = tags.get('natural', '')
if leisure in ['park', 'garden', 'nature_reserve']:
green_space_type = leisure
elif landuse in ['forest', 'grass', 'park']:
green_space_type = landuse
elif natural in ['forest', 'wood']:
green_space_type = natural
if not green_space_type:
continue
# Get coordinates from first and last node to estimate center
nd_refs = [nd.get('ref') for nd in way.findall('nd')]
if len(nd_refs) < 3:
continue
# Find first node coordinates
first_node = root.find(f".//node[@id='{nd_refs[0]}']")
if first_node is None:
continue
lat = float(first_node.get('lat'))
lng = float(first_node.get('lon'))
# Simple space data
space_data = {
'id': f"test_{way.get('id')}",
'name': tags.get('name', f"Test {green_space_type} {processed_count + 1}"),
'fclass': green_space_type,
'lat': lat,
'lng': lng,
'area_sqm': 5000, # Default for test
'district': 'Test District'
}
sample_spaces.append(space_data)
processed_count += 1
print(f"🌳 Testing with {len(sample_spaces)} sample green spaces...")
# Test enhancement with real data
enhanced_spaces = []
for i, space_data in enumerate(sample_spaces, 1):
print(f"\n[{i}/{len(sample_spaces)}] Testing {space_data['name']}...")
try:
# Get real tree data
tree_response = await tree_service.get_trees_near_location(
space_data['lat'], space_data['lng'], radius_m=200
)
# Get real toilet data
nearby_toilets = await berlin_data.get_toilets_near_point(
space_data['lat'], space_data['lng'], 500
)
# Create enhanced data
enhanced_space = {
"id": space_data['id'],
"name": space_data['name'],
"type": "PARK",
"coordinates": {
"lat": space_data['lat'],
"lng": space_data['lng']
},
"tree_data": {
"total_trees": tree_response.metrics.total_trees,
"species_count": len(tree_response.metrics.dominant_species),
"dominant_species": tree_response.metrics.dominant_species
},
"toilet_accessibility": {
"nearby_toilets_count": len(nearby_toilets),
"nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None
}
}
enhanced_spaces.append(enhanced_space)
trees = tree_response.metrics.total_trees
toilets = len(nearby_toilets)
print(f"✅ Success: {trees} trees, {toilets} toilets nearby")
except Exception as e:
print(f"❌ Error: {e}")
# Save test results
output_file = Path("app/data/processed/test_green_spaces.json")
test_data = {
"test_results": enhanced_spaces,
"total_tested": len(enhanced_spaces),
"osm_ways_available": len(ways),
"processing_successful": True,
"timestamp": datetime.now().isoformat()
}
with open(output_file, 'w') as f:
json.dump(test_data, f, indent=2)
print(f"\n🎉 Test completed successfully!")
print(f"📁 Test results saved: {output_file}")
print(f"📊 Enhanced {len(enhanced_spaces)} sample spaces")
print(f"💡 Ready to process all {len(ways)} green spaces!")
return True
if __name__ == "__main__":
    # Script entry point: run the sample OSM processing test.
    asyncio.run(test_processing())