#!/usr/bin/env python3
"""
Enhanced Berlin green space processor using existing tree and toilet services.
Downloads OSM green space boundaries and enhances them with real data using existing services.
"""

import os
import json
import zipfile
import requests
import asyncio
from pathlib import Path
import geopandas as gpd
import pandas as pd
from datetime import datetime
from typing import List, Dict, Optional
import sys

# Add the app directory to Python path to import services
sys.path.append(str(Path(__file__).parent.parent))

from app.services.street_tree_service import StreetTreeService
from app.services.berlin_data_service import BerlinDataService

# ETRS89 / UTM zone 33N — the metric CRS covering Berlin. Areas and lengths
# must be computed in this projection; on the raw WGS84 geometry, .area/.length
# return square degrees / degrees, not sqm / m.
_METRIC_CRS = 25833


class RealDataGreenSpaceProcessor:
    """Downloads OSM green-space polygons for Berlin and enriches each one with
    real tree data (StreetTreeService) and toilet data (BerlinDataService),
    writing the result to a single processed JSON file."""

    def __init__(self, data_dir: str = "app/data"):
        self.data_dir = Path(data_dir)
        self.raw_dir = self.data_dir / "geo-raw"
        self.processed_dir = self.data_dir / "processed"

        # Create directories
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)

        # Initialize existing services
        self.tree_service = StreetTreeService()
        self.berlin_data = BerlinDataService()

    def download_berlin_districts(self) -> Path:
        """Download Berlin district boundaries (cached on disk).

        Returns:
            Path to the downloaded GeoJSON file.

        Raises:
            requests.RequestException: if the download fails.
        """
        json_file = self.raw_dir / "bezirksgrenzen.geojson"

        if json_file.exists():
            print(f"Berlin district data already exists: {json_file}")
            return json_file

        link = "https://tsb-opendata.s3.eu-central-1.amazonaws.com/bezirksgrenzen/bezirksgrenzen.geojson"
        print(f"Downloading Berlin district data from {link}")

        try:
            response = requests.get(link, timeout=30)
            response.raise_for_status()
            with open(json_file, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded to {json_file}")
            return json_file
        except Exception as e:
            print(f"Error downloading districts: {e}")
            raise

    def download_osm_data(self) -> Path:
        """Download and extract the Geofabrik Berlin shapefile bundle (cached).

        Returns:
            Path to the directory containing the extracted shapefiles.

        Raises:
            requests.RequestException: if the download fails.
            zipfile.BadZipFile: if the archive is corrupt.
        """
        zip_file = self.raw_dir / "berlin_shapes.zip"
        shp_dir = self.raw_dir / "berlin_shapes"

        # Check if already extracted
        required_files = ["gis_osm_landuse_a_free_1.shp",
                          "gis_osm_natural_a_free_1.shp",
                          "gis_osm_leisure_a_free_1.shp"]
        if all((shp_dir / f).exists() for f in required_files):
            print(f"Berlin OSM data already exists: {shp_dir}")
            return shp_dir

        if not zip_file.exists():
            link = "https://download.geofabrik.de/europe/germany/berlin-latest-free.shp.zip"
            print(f"Downloading Berlin OSM data from {link} (this may take several minutes...)")

            try:
                response = requests.get(link, stream=True, timeout=300)  # 5 min timeout
                response.raise_for_status()
                # Stream to disk in chunks: the archive is large.
                with open(zip_file, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print(f"Download completed: {zip_file}")
            except Exception as e:
                print(f"Error downloading OSM data: {e}")
                raise

        print(f"Extracting Berlin OSM data to {shp_dir}")
        try:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(shp_dir)
            print(f"Extracted to {shp_dir}")
        except Exception as e:
            print(f"Error extracting OSM data: {e}")
            raise

        return shp_dir

    def load_osm_green_spaces(self) -> gpd.GeoDataFrame:
        """Load OSM green space polygons clipped to Berlin, with metric area,
        perimeter, district name and WGS84 centroid columns attached.

        Raises:
            ValueError: if no green space features survive filtering.
        """
        print("Loading OSM green space boundaries...")

        # Download required data
        districts_file = self.download_berlin_districts()
        shp_dir = self.download_osm_data()

        # Load Berlin districts for clipping
        districts = gpd.read_file(districts_file)

        # Define green space categories we want
        green_categories = {
            'landuse': ['forest', 'grass', 'meadow', 'recreation_ground',
                        'village_green', 'allotments'],
            'natural': ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood'],
            'leisure': ['park', 'garden', 'nature_reserve', 'playground',
                        'pitch', 'common', 'golf_course']
        }

        all_green_spaces = []

        # Process each category
        for category, subcategories in green_categories.items():
            shapefile = shp_dir / f"gis_osm_{category}_a_free_1.shp"

            if not shapefile.exists():
                print(f"Warning: {shapefile} not found, skipping")
                continue

            print(f"Processing {category} data...")

            try:
                gdf = gpd.read_file(shapefile)

                # Filter to relevant subcategories
                gdf_filtered = gdf[gdf['fclass'].isin(subcategories)].copy()

                if len(gdf_filtered) == 0:
                    print(f"No {category} features found in subcategories")
                    continue

                # Clip to Berlin boundaries
                gdf_clipped = gpd.clip(gdf_filtered, districts)

                # Calculate area and filter out very small areas (< 1000 sqm).
                # FIX: the source data is WGS84 (degrees); .area on the raw
                # geometry would return square degrees. Reproject to the
                # metric CRS so the value really is square metres.
                gdf_clipped['area_sqm'] = gdf_clipped.geometry.to_crs(epsg=_METRIC_CRS).area
                gdf_clipped = gdf_clipped[gdf_clipped['area_sqm'] >= 1000]

                if len(gdf_clipped) > 0:
                    all_green_spaces.append(gdf_clipped)
                    print(f"Found {len(gdf_clipped)} {category} features")

            except Exception as e:
                print(f"Error processing {category}: {e}")
                continue

        if not all_green_spaces:
            raise ValueError("No green space data found")

        # Combine all green spaces
        green_spaces = gpd.GeoDataFrame(pd.concat(all_green_spaces, ignore_index=True))

        # Add district information.
        # NOTE(review): column name 'Bezirk' is assumed to exist in the TSB
        # district GeoJSON — verify against the downloaded file's schema.
        green_spaces = gpd.sjoin(green_spaces, districts[['Bezirk', 'geometry']], how='left')
        # FIX: sjoin duplicates a polygon that intersects two districts;
        # keep only the first district match per feature.
        green_spaces = green_spaces[~green_spaces.index.duplicated(keep='first')]

        # Calculate centroids and perimeters for analysis.
        # FIX: compute both in the metric CRS (centroids of lat/lng polygons
        # are geometrically skewed), then convert centroids back to WGS84.
        projected = green_spaces.geometry.to_crs(epsg=_METRIC_CRS)
        green_spaces['perimeter_m'] = projected.length
        centroids = projected.centroid.to_crs(green_spaces.crs)
        green_spaces['centroid_lat'] = centroids.y
        green_spaces['centroid_lng'] = centroids.x

        print(f"Total green spaces found: {len(green_spaces)}")
        return green_spaces

    async def enhance_green_space_with_real_data(self, row) -> Optional[Dict]:
        """Enhance a single green space row with real tree and toilet data.

        Returns the enhanced-space dict, or None if enhancement failed
        (errors are logged, never raised, so one bad row cannot abort a run).
        """
        try:
            lat = row['centroid_lat']
            lng = row['centroid_lng']
            area_sqm = int(row['area_sqm'])

            # Use existing tree service to get real tree data
            tree_response = await self.tree_service.get_trees_near_location(
                lat, lng, radius_m=min(400, int((area_sqm ** 0.5) * 1.5))  # Adaptive radius
            )

            # Use existing toilet service to get real toilet data
            nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 800)

            # Calculate toilet accessibility score
            toilet_score = self._score_toilet_accessibility(nearby_toilets)

            # Map OSM type to our enum
            space_type = self._map_osm_to_space_type(row.get('fclass', ''))

            # Generate ID
            space_id = f"real_{row.get('fclass', 'unknown')}_{row.name}"

            # Create enhanced green space using real data
            enhanced_space = {
                "id": space_id,
                "name": row.get('name') or f"{row.get('fclass', 'Green Space').title()} in {row.get('Bezirk', 'Berlin')}",
                "description": f"Real Berlin {row.get('fclass', 'green space')} enhanced with tree and toilet data",
                "type": space_type,
                "coordinates": {
                    "lat": float(lat),
                    "lng": float(lng)
                },
                "neighborhood": row.get('Bezirk', 'Unknown'),
                "area_sqm": area_sqm,
                # FIX: use the metric perimeter computed in load_osm_green_spaces;
                # row.geometry.length on WGS84 geometry would be in degrees.
                "perimeter_m": int(row.get('perimeter_m', 0)),

                # Environmental features using real tree data
                "environmental": {
                    "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                    "shade_quality": tree_response.shade_analysis.shade_quality_score,
                    "noise_level": self._estimate_noise_level(row.get('fclass', ''), row.get('Bezirk', '')),
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    "water_features": 'water' in str(row.get('fclass', '')).lower() or 'river' in str(row.get('name', '')).lower(),
                    "natural_surface_percent": self._estimate_natural_surface(row.get('fclass', ''))
                },

                # Real tree metrics from existing service
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species
                },

                # Real toilet accessibility from existing service
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": toilet_score,
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },

                # Standard accessibility features
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": 3,  # Could be enhanced with real transit data
                    "cycling_infrastructure": area_sqm > 5000,
                    "parking_availability": 2,
                    "lighting_quality": 2
                },

                # Recreation features based on OSM data and size
                "recreation": {
                    "playground_quality": self._estimate_playground_quality(row.get('fclass', ''), tree_response.metrics.total_trees),
                    "sports_facilities": 'pitch' in str(row.get('fclass', '')).lower() or 'sport' in str(row.get('name', '')).lower(),
                    "running_paths": area_sqm > 8000,
                    "cycling_paths": area_sqm > 15000,
                    "dog_friendly": True,
                    "bbq_allowed": row.get('fclass') in ['park', 'recreation_ground'] and area_sqm > 5000
                },

                "last_updated": datetime.now().isoformat(),
                "data_sources": ["openstreetmap", "berlin_tree_cadastre", "berlin_toilets"],
                "confidence_score": 95
            }

            return enhanced_space

        except Exception as e:
            print(f"Error enhancing green space {row.name}: {e}")
            return None

    def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
        """Score toilet accessibility (0-100) using existing toilet data.

        Distance to the nearest toilet sets the base score; free and
        wheelchair-accessible toilets add a capped bonus.
        """
        if not nearby_toilets:
            return 20

        nearest_distance = nearby_toilets[0]['distance_meters']

        # Distance-based scoring
        if nearest_distance <= 200:
            score = 100
        elif nearest_distance <= 400:
            score = 80
        elif nearest_distance <= 600:
            score = 60
        else:
            score = 40

        # Bonuses for quality
        free_toilets = len([t for t in nearby_toilets if t.get('is_free', False)])
        accessible_toilets = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
        score += min(20, free_toilets * 5 + accessible_toilets * 3)

        return min(100, score)

    def _map_osm_to_space_type(self, fclass: str) -> str:
        """Map OSM feature class to green space types (defaults to 'PARK')."""
        mapping = {
            'park': 'PARK',
            'forest': 'FOREST',
            'garden': 'GARDEN',
            'nature_reserve': 'NATURE_RESERVE',
            'playground': 'PLAYGROUND',
            'meadow': 'MEADOW',
            'grass': 'GRASS',
            'recreation_ground': 'PARK',
            'wood': 'FOREST',
            'heath': 'HEATH',
            'pitch': 'SPORTS_AREA',
            'golf_course': 'SPORTS_AREA',
            'common': 'PARK',
            'village_green': 'GRASS',
            'allotments': 'GARDEN'
        }
        return mapping.get(fclass, 'PARK')

    def _estimate_noise_level(self, fclass: str, district: str) -> int:
        """Estimate noise level (1=very quiet, 5=very noisy)."""
        base_noise = {
            'forest': 1, 'nature_reserve': 1, 'wood': 1,
            'meadow': 2, 'grass': 2, 'heath': 2,
            'park': 2, 'garden': 2, 'common': 2,
            'recreation_ground': 3, 'playground': 3, 'pitch': 3,
            'golf_course': 2, 'allotments': 2
        }

        # Central districts are noisier
        central_districts = ['Mitte', 'Kreuzberg', 'Friedrichshain']
        district_modifier = 1 if district in central_districts else 0

        return min(5, base_noise.get(fclass, 2) + district_modifier)

    def _estimate_natural_surface(self, fclass: str) -> int:
        """Estimate percentage of natural surface."""
        surface_map = {
            'forest': 95, 'nature_reserve': 95, 'wood': 95,
            'meadow': 95, 'grass': 90, 'heath': 90,
            'park': 75, 'garden': 65, 'common': 80,
            'recreation_ground': 60, 'playground': 40, 'pitch': 20,
            'golf_course': 70, 'allotments': 85
        }
        return surface_map.get(fclass, 70)

    def _estimate_playground_quality(self, fclass: str, tree_count: int) -> int:
        """Estimate playground quality score (0-100)."""
        base_scores = {
            'playground': 85,
            'park': 65,
            'recreation_ground': 70,
            'garden': 40,
            'common': 50
        }
        base = base_scores.get(fclass, 25)

        # Trees improve playground appeal for families
        tree_bonus = min(15, tree_count // 5)  # +3 per 5 trees, max 15

        return min(100, base + tree_bonus)

    async def process_all_green_spaces(self) -> List[Dict]:
        """Process all green spaces with real data enhancement.

        Returns the list of enhanced-space dicts (rows whose enhancement
        failed are skipped).
        """
        print("Starting enhanced green space processing with real data...")

        # Load OSM green space boundaries
        osm_green_spaces = self.load_osm_green_spaces()

        enhanced_green_spaces = []

        print(f"Enhancing {len(osm_green_spaces)} green spaces with real tree and toilet data...")

        # Process in batches to avoid overwhelming the system
        batch_size = 50
        total_processed = 0

        for i in range(0, len(osm_green_spaces), batch_size):
            batch = osm_green_spaces.iloc[i:i+batch_size]
            batch_results = []

            for idx, row in batch.iterrows():
                result = await self.enhance_green_space_with_real_data(row)
                if result:
                    batch_results.append(result)

                total_processed += 1
                if total_processed % 25 == 0:
                    print(f"Processed {total_processed}/{len(osm_green_spaces)} green spaces...")

            enhanced_green_spaces.extend(batch_results)

            # Small delay between batches
            await asyncio.sleep(0.1)

        print(f"Successfully enhanced {len(enhanced_green_spaces)} green spaces with real data")
        return enhanced_green_spaces

    def save_enhanced_data(self, enhanced_green_spaces: List[Dict]) -> Path:
        """Save enhanced green spaces (plus summary stats) to a JSON file.

        Returns the path of the written file.
        """
        output_file = self.processed_dir / "real_berlin_green_spaces.json"

        # Calculate summary statistics
        total = len(enhanced_green_spaces)
        spaces_with_trees = len([gs for gs in enhanced_green_spaces
                                 if gs["tree_data"]["total_trees"] > 0])
        spaces_with_toilets = len([gs for gs in enhanced_green_spaces
                                   if gs["toilet_accessibility"]["nearby_toilets_count"] > 0])
        total_trees = sum(gs["tree_data"]["total_trees"] for gs in enhanced_green_spaces)
        avg_species_per_space = (sum(gs["tree_data"]["species_count"]
                                     for gs in enhanced_green_spaces) / total) if total else 0
        # FIX: guard the percentage computations — every enhancement can fail,
        # leaving an empty list, and the original prints divided by len() unguarded.
        tree_pct = round((spaces_with_trees / total) * 100, 1) if total else 0
        toilet_pct = round((spaces_with_toilets / total) * 100, 1) if total else 0

        data = {
            "green_spaces": enhanced_green_spaces,
            "total_count": total,
            "last_updated": datetime.now().isoformat(),
            "data_sources": [
                "openstreetmap_boundaries",
                "berlin_tree_cadastre_via_service",
                "berlin_toilet_locations_via_service",
                "berlin_districts"
            ],
            "processing_info": {
                "script_version": "1.0",
                "coordinate_system": "WGS84",
                "uses_existing_services": True,
                "tree_analysis_via": "StreetTreeService",
                "toilet_analysis_via": "BerlinDataService"
            },
            "summary_stats": {
                "spaces_with_trees": spaces_with_trees,
                "spaces_with_nearby_toilets": spaces_with_toilets,
                "total_trees_in_all_spaces": total_trees,
                "average_species_per_space": round(avg_species_per_space, 1),
                "coverage_percentage": {
                    "with_tree_data": tree_pct,
                    "with_toilet_data": toilet_pct
                }
            }
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"āœ… Saved {total} enhanced green spaces to {output_file}")
        print(f"šŸ“Š Summary:")
        print(f"   - {spaces_with_trees} spaces have tree data ({tree_pct}%)")
        print(f"   - {spaces_with_toilets} spaces have nearby toilets ({toilet_pct}%)")
        print(f"   - {total_trees} total trees analyzed")
        print(f"   - {avg_species_per_space:.1f} average species per space")

        return output_file


async def main():
    """Main processing function."""
    processor = RealDataGreenSpaceProcessor()

    try:
        # Process enhanced green spaces using existing services
        enhanced_green_spaces = await processor.process_all_green_spaces()

        # Save enhanced data
        output_file = processor.save_enhanced_data(enhanced_green_spaces)

        print(f"\nšŸŽ‰ Successfully created real data enhanced Berlin green spaces!")
        print(f"šŸ“ Output: {output_file}")

    except KeyboardInterrupt:
        print("\nāš ļø Processing interrupted by user")
    except Exception as e:
        print(f"āŒ Error processing data: {e}")
        raise


if __name__ == "__main__":
    asyncio.run(main())