Add real green space data and neighborhood filtering
This commit is contained in:
parent
c14f5ead38
commit
49e3d8c29d
File diff suppressed because it is too large
Load Diff
|
@ -185,3 +185,142 @@ async def get_current_conditions(
|
||||||
return conditions
|
return conditions
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=500, detail=f"Failed to get conditions: {str(e)}")
|
raise HTTPException(status_code=500, detail=f"Failed to get conditions: {str(e)}")
|
||||||
|
|
||||||
|
@router.get("/all")
|
||||||
|
async def get_all_green_spaces(
|
||||||
|
personality: Optional[PersonalityType] = Query(None, description="Personality type for scoring"),
|
||||||
|
min_score: int = Query(0, ge=0, le=100, description="Minimum personality score (only applies if personality is provided)"),
|
||||||
|
limit: int = Query(50, ge=1, le=200, description="Maximum results"),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Get all available green spaces in Berlin.
|
||||||
|
|
||||||
|
Optionally score them for a specific personality type.
|
||||||
|
Perfect for frontend dropdowns or full dataset access.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Get all green spaces
|
||||||
|
all_spaces = await berlin_data.search_green_spaces()
|
||||||
|
|
||||||
|
# If personality is specified, score and filter
|
||||||
|
if personality:
|
||||||
|
scored_spaces = []
|
||||||
|
for space in all_spaces:
|
||||||
|
personality_score = await green_space_service.scoring_engine.score_green_space(
|
||||||
|
space, personality.value
|
||||||
|
)
|
||||||
|
|
||||||
|
if personality_score.score >= min_score:
|
||||||
|
space.current_personality_score = personality_score
|
||||||
|
scored_spaces.append(space)
|
||||||
|
|
||||||
|
# Sort by score (highest first)
|
||||||
|
scored_spaces.sort(
|
||||||
|
key=lambda x: x.current_personality_score.score if x.current_personality_score else 0,
|
||||||
|
reverse=True
|
||||||
|
)
|
||||||
|
all_spaces = scored_spaces
|
||||||
|
|
||||||
|
# Apply limit
|
||||||
|
limited_spaces = all_spaces[:limit]
|
||||||
|
|
||||||
|
return {
|
||||||
|
"green_spaces": limited_spaces,
|
||||||
|
"total_available": len(all_spaces),
|
||||||
|
"returned_count": len(limited_spaces),
|
||||||
|
"personality": personality.value if personality else None,
|
||||||
|
"min_score_applied": min_score if personality else None
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to get green spaces: {str(e)}")
|
||||||
|
|
||||||
|
@router.get("/recommendations/{personality}")
|
||||||
|
async def get_personality_recommendations(
|
||||||
|
personality: PersonalityType,
|
||||||
|
limit: int = Query(20, ge=1, le=50, description="Number of recommendations"),
|
||||||
|
neighborhood: Optional[str] = Query(None, description="Preferred neighborhood"),
|
||||||
|
min_score: int = Query(70, ge=50, le=100, description="Minimum personality score"),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Get personalized green space recommendations.
|
||||||
|
|
||||||
|
Returns the best green spaces for a specific personality type,
|
||||||
|
with explanations of why each space is recommended.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Get all green spaces
|
||||||
|
all_spaces = await berlin_data.search_green_spaces(neighborhood=neighborhood)
|
||||||
|
|
||||||
|
# Score and rank for personality
|
||||||
|
recommendations = []
|
||||||
|
for space in all_spaces:
|
||||||
|
personality_score = await green_space_service.scoring_engine.score_green_space(
|
||||||
|
space, personality.value
|
||||||
|
)
|
||||||
|
|
||||||
|
if personality_score.score >= min_score:
|
||||||
|
space.current_personality_score = personality_score
|
||||||
|
|
||||||
|
# Get additional insights
|
||||||
|
best_features = []
|
||||||
|
if space.environmental.tree_coverage_percent > 70:
|
||||||
|
best_features.append("Excellent tree coverage")
|
||||||
|
if space.environmental.water_features:
|
||||||
|
best_features.append("Water features")
|
||||||
|
if space.recreation.playground_quality > 60:
|
||||||
|
best_features.append("Good playground facilities")
|
||||||
|
if space.recreation.sports_facilities:
|
||||||
|
best_features.append("Sports facilities")
|
||||||
|
if space.environmental.noise_level.value <= 2:
|
||||||
|
best_features.append("Peaceful atmosphere")
|
||||||
|
|
||||||
|
recommendation = {
|
||||||
|
"green_space": space,
|
||||||
|
"score": personality_score.score,
|
||||||
|
"explanation": personality_score.explanation,
|
||||||
|
"best_features": best_features[:3], # Top 3 features
|
||||||
|
"visit_recommendation": _get_visit_recommendation(space, personality.value)
|
||||||
|
}
|
||||||
|
recommendations.append(recommendation)
|
||||||
|
|
||||||
|
# Sort by score
|
||||||
|
recommendations.sort(key=lambda x: x["score"], reverse=True)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"recommendations": recommendations[:limit],
|
||||||
|
"personality": personality.value,
|
||||||
|
"total_matches": len(recommendations),
|
||||||
|
"search_filters": {
|
||||||
|
"neighborhood": neighborhood,
|
||||||
|
"min_score": min_score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to get recommendations: {str(e)}")
|
||||||
|
|
||||||
|
def _get_visit_recommendation(space, personality: str) -> str:
|
||||||
|
"""Get a personalized visit recommendation"""
|
||||||
|
if personality == "little_adventurers":
|
||||||
|
if space.recreation.playground_quality > 60:
|
||||||
|
return "Perfect for family adventures with great playground facilities"
|
||||||
|
return "Great for exploring with kids"
|
||||||
|
elif personality == "date_night":
|
||||||
|
if space.environmental.noise_level.value <= 2:
|
||||||
|
return "Romantic and peaceful setting for couples"
|
||||||
|
return "Nice atmosphere for a romantic stroll"
|
||||||
|
elif personality == "zen_masters":
|
||||||
|
if space.environmental.tree_coverage_percent > 70:
|
||||||
|
return "Ideal for peaceful meditation under the trees"
|
||||||
|
return "Perfect for quiet contemplation"
|
||||||
|
elif personality == "active_lifestyle":
|
||||||
|
if space.recreation.sports_facilities:
|
||||||
|
return "Great for workouts and active recreation"
|
||||||
|
return "Perfect for running and outdoor activities"
|
||||||
|
elif personality == "wildlife_lover":
|
||||||
|
if space.environmental.wildlife_diversity_score > 70:
|
||||||
|
return "Excellent biodiversity for nature observation"
|
||||||
|
return "Good spot for wildlife watching"
|
||||||
|
else:
|
||||||
|
return "Highly recommended for your personality type"
|
||||||
|
|
|
@ -20,6 +20,7 @@ class BerlinDataService:
|
||||||
self.cache = {}
|
self.cache = {}
|
||||||
self.last_refresh = None
|
self.last_refresh = None
|
||||||
self._toilets_cache = None
|
self._toilets_cache = None
|
||||||
|
self._green_spaces_cache = None
|
||||||
self._street_trees_index = None
|
self._street_trees_index = None
|
||||||
self.data_dir = Path("app/data")
|
self.data_dir = Path("app/data")
|
||||||
self.street_tree_service = StreetTreeService()
|
self.street_tree_service = StreetTreeService()
|
||||||
|
@ -46,9 +47,16 @@ class BerlinDataService:
|
||||||
if distance > radius:
|
if distance > radius:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Apply neighborhood filter
|
# Apply neighborhood filter with flexible matching
|
||||||
if neighborhood and space.neighborhood.lower() != neighborhood.lower():
|
if neighborhood:
|
||||||
continue
|
neighborhood_lower = neighborhood.lower()
|
||||||
|
space_neighborhood_lower = space.neighborhood.lower()
|
||||||
|
|
||||||
|
# Check for exact match or partial match (useful for compound neighborhood names)
|
||||||
|
if (neighborhood_lower != space_neighborhood_lower and
|
||||||
|
neighborhood_lower not in space_neighborhood_lower and
|
||||||
|
space_neighborhood_lower not in neighborhood_lower):
|
||||||
|
continue
|
||||||
|
|
||||||
# Apply other filters
|
# Apply other filters
|
||||||
if filters:
|
if filters:
|
||||||
|
@ -214,43 +222,60 @@ class BerlinDataService:
|
||||||
|
|
||||||
async def get_neighborhood_stats(self) -> Dict[str, Any]:
|
async def get_neighborhood_stats(self) -> Dict[str, Any]:
|
||||||
"""Get statistics for Berlin neighborhoods."""
|
"""Get statistics for Berlin neighborhoods."""
|
||||||
return {
|
# Get all green spaces to calculate real neighborhood stats
|
||||||
"neighborhoods": [
|
green_spaces = await self._get_mock_green_spaces()
|
||||||
{
|
|
||||||
"name": "mitte",
|
# Count green spaces per neighborhood
|
||||||
"display_name": "Mitte",
|
neighborhood_counts = {}
|
||||||
"green_space_count": 15,
|
neighborhood_spaces = {}
|
||||||
"avg_personality_scores": {
|
|
||||||
"little_adventurers": 75,
|
for space in green_spaces:
|
||||||
"date_night": 80,
|
neighborhood = space.neighborhood
|
||||||
"squad_goals": 70,
|
if neighborhood not in neighborhood_counts:
|
||||||
"zen_masters": 65
|
neighborhood_counts[neighborhood] = 0
|
||||||
}
|
neighborhood_spaces[neighborhood] = []
|
||||||
},
|
neighborhood_counts[neighborhood] += 1
|
||||||
{
|
neighborhood_spaces[neighborhood].append(space)
|
||||||
"name": "kreuzberg",
|
|
||||||
"display_name": "Kreuzberg",
|
# Generate neighborhood stats
|
||||||
"green_space_count": 12,
|
neighborhoods = []
|
||||||
"avg_personality_scores": {
|
for neighborhood, count in neighborhood_counts.items():
|
||||||
"little_adventurers": 70,
|
# Calculate average personality scores based on green space features
|
||||||
"date_night": 75,
|
spaces = neighborhood_spaces[neighborhood]
|
||||||
"squad_goals": 85,
|
|
||||||
"zen_masters": 60
|
# Calculate scores based on actual features
|
||||||
}
|
total_tree_coverage = sum(s.environmental.tree_coverage_percent for s in spaces)
|
||||||
},
|
total_playgrounds = sum(s.recreation.playground_quality for s in spaces)
|
||||||
{
|
total_water_features = sum(1 for s in spaces if s.environmental.water_features)
|
||||||
"name": "prenzlauer_berg",
|
total_sports = sum(1 for s in spaces if s.recreation.sports_facilities)
|
||||||
"display_name": "Prenzlauer Berg",
|
|
||||||
"green_space_count": 18,
|
avg_tree_coverage = total_tree_coverage / count if count > 0 else 0
|
||||||
"avg_personality_scores": {
|
avg_playground = total_playgrounds / count if count > 0 else 0
|
||||||
"little_adventurers": 90,
|
water_ratio = total_water_features / count if count > 0 else 0
|
||||||
"date_night": 70,
|
sports_ratio = total_sports / count if count > 0 else 0
|
||||||
"squad_goals": 75,
|
|
||||||
"zen_masters": 70
|
# Calculate personality scores based on features
|
||||||
}
|
little_adventurers = min(100, int(avg_playground * 0.8 + sports_ratio * 30 + 40))
|
||||||
|
date_night = min(100, int(avg_tree_coverage * 0.6 + water_ratio * 25 + 45))
|
||||||
|
squad_goals = min(100, int(sports_ratio * 40 + avg_tree_coverage * 0.4 + 35))
|
||||||
|
zen_masters = min(100, int(avg_tree_coverage * 0.7 + water_ratio * 20 + 30))
|
||||||
|
|
||||||
|
neighborhoods.append({
|
||||||
|
"name": neighborhood.lower().replace(' ', '_').replace('-', '_'),
|
||||||
|
"display_name": neighborhood,
|
||||||
|
"green_space_count": count,
|
||||||
|
"avg_personality_scores": {
|
||||||
|
"little_adventurers": little_adventurers,
|
||||||
|
"date_night": date_night,
|
||||||
|
"squad_goals": squad_goals,
|
||||||
|
"zen_masters": zen_masters
|
||||||
}
|
}
|
||||||
]
|
})
|
||||||
}
|
|
||||||
|
# Sort by green space count (most spaces first)
|
||||||
|
neighborhoods.sort(key=lambda x: x["green_space_count"], reverse=True)
|
||||||
|
|
||||||
|
return {"neighborhoods": neighborhoods}
|
||||||
|
|
||||||
async def get_current_conditions(self, lat: float, lng: float) -> Dict[str, Any]:
|
async def get_current_conditions(self, lat: float, lng: float) -> Dict[str, Any]:
|
||||||
"""Get current conditions at a location."""
|
"""Get current conditions at a location."""
|
||||||
|
@ -394,122 +419,76 @@ class BerlinDataService:
|
||||||
# Return original space if enhancement fails
|
# Return original space if enhancement fails
|
||||||
return green_space
|
return green_space
|
||||||
|
|
||||||
|
def _load_green_spaces(self) -> List[Dict]:
|
||||||
|
"""Load green spaces data from JSON file"""
|
||||||
|
if self._green_spaces_cache is None:
|
||||||
|
green_spaces_file = self.data_dir / "processed" / "quick_berlin_green_spaces.json"
|
||||||
|
if green_spaces_file.exists():
|
||||||
|
with open(green_spaces_file, 'r', encoding='utf-8') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
self._green_spaces_cache = data.get("green_spaces", [])
|
||||||
|
else:
|
||||||
|
print("Warning: quick_berlin_green_spaces.json not found.")
|
||||||
|
self._green_spaces_cache = []
|
||||||
|
return self._green_spaces_cache
|
||||||
|
|
||||||
|
def _convert_json_to_green_space(self, json_data: Dict) -> GreenSpace:
|
||||||
|
"""Convert JSON data to GreenSpace model"""
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
return GreenSpace(
|
||||||
|
id=json_data.get("id", ""),
|
||||||
|
name=json_data.get("name", ""),
|
||||||
|
description=json_data.get("description", ""),
|
||||||
|
type=GreenSpaceType.PARK, # Default to PARK, could be enhanced later
|
||||||
|
coordinates=Coordinates(
|
||||||
|
lat=json_data.get("coordinates", {}).get("lat", 0.0),
|
||||||
|
lng=json_data.get("coordinates", {}).get("lng", 0.0)
|
||||||
|
),
|
||||||
|
neighborhood=json_data.get("neighborhood", "Berlin"),
|
||||||
|
address=f"{json_data.get('name', 'Unknown')}, Berlin",
|
||||||
|
area_sqm=json_data.get("area_sqm", 0),
|
||||||
|
perimeter_m=json_data.get("perimeter_m", None),
|
||||||
|
environmental=EnvironmentalFeatures(
|
||||||
|
tree_coverage_percent=json_data.get("environmental", {}).get("tree_coverage_percent", 0),
|
||||||
|
shade_quality=json_data.get("environmental", {}).get("shade_quality", 0),
|
||||||
|
noise_level=NoiseLevel(json_data.get("environmental", {}).get("noise_level", 1)),
|
||||||
|
wildlife_diversity_score=json_data.get("environmental", {}).get("wildlife_diversity_score", 0),
|
||||||
|
water_features=json_data.get("environmental", {}).get("water_features", False),
|
||||||
|
natural_surface_percent=json_data.get("environmental", {}).get("natural_surface_percent", 0)
|
||||||
|
),
|
||||||
|
accessibility=AccessibilityFeatures(
|
||||||
|
wheelchair_accessible=json_data.get("accessibility", {}).get("wheelchair_accessible", True),
|
||||||
|
public_transport_score=json_data.get("accessibility", {}).get("public_transport_score", 3),
|
||||||
|
cycling_infrastructure=json_data.get("accessibility", {}).get("cycling_infrastructure", True),
|
||||||
|
parking_availability=json_data.get("accessibility", {}).get("parking_availability", 2),
|
||||||
|
lighting_quality=json_data.get("accessibility", {}).get("lighting_quality", 3)
|
||||||
|
),
|
||||||
|
recreation=RecreationFeatures(
|
||||||
|
playground_quality=json_data.get("recreation", {}).get("playground_quality", 0),
|
||||||
|
sports_facilities=json_data.get("recreation", {}).get("sports_facilities", False),
|
||||||
|
running_paths=json_data.get("recreation", {}).get("running_paths", True),
|
||||||
|
cycling_paths=json_data.get("recreation", {}).get("cycling_paths", True),
|
||||||
|
dog_friendly=json_data.get("recreation", {}).get("dog_friendly", True),
|
||||||
|
bbq_allowed=json_data.get("recreation", {}).get("bbq_allowed", False)
|
||||||
|
),
|
||||||
|
nearby_amenities=[],
|
||||||
|
last_updated=datetime.fromisoformat(json_data.get("last_updated", datetime.now().isoformat())),
|
||||||
|
data_sources=json_data.get("data_sources", []),
|
||||||
|
confidence_score=json_data.get("confidence_score", 85)
|
||||||
|
)
|
||||||
|
|
||||||
async def _get_mock_green_spaces(self) -> List[GreenSpace]:
|
async def _get_mock_green_spaces(self) -> List[GreenSpace]:
|
||||||
"""Get mock green spaces data for development."""
|
"""Get green spaces data from JSON file."""
|
||||||
# This would be replaced with real data fetching in production
|
json_data = self._load_green_spaces()
|
||||||
return [
|
green_spaces = []
|
||||||
GreenSpace(
|
|
||||||
id="tiergarten_1",
|
for space_data in json_data:
|
||||||
name="Tiergarten",
|
try:
|
||||||
description="Berlin's most famous park in the heart of the city",
|
green_space = self._convert_json_to_green_space(space_data)
|
||||||
type=GreenSpaceType.PARK,
|
green_spaces.append(green_space)
|
||||||
coordinates=Coordinates(lat=52.5145, lng=13.3501),
|
except Exception as e:
|
||||||
neighborhood="Mitte",
|
print(f"Error converting green space {space_data.get('id', 'unknown')}: {e}")
|
||||||
address="Tiergarten, 10557 Berlin",
|
continue
|
||||||
area_sqm=210000,
|
|
||||||
perimeter_m=5800,
|
return green_spaces
|
||||||
environmental=EnvironmentalFeatures(
|
|
||||||
tree_coverage_percent=85,
|
|
||||||
shade_quality=90,
|
|
||||||
noise_level=NoiseLevel.MODERATE,
|
|
||||||
wildlife_diversity_score=80,
|
|
||||||
water_features=True,
|
|
||||||
natural_surface_percent=95
|
|
||||||
),
|
|
||||||
accessibility=AccessibilityFeatures(
|
|
||||||
wheelchair_accessible=True,
|
|
||||||
public_transport_score=5,
|
|
||||||
cycling_infrastructure=True,
|
|
||||||
parking_availability=3,
|
|
||||||
lighting_quality=4
|
|
||||||
),
|
|
||||||
recreation=RecreationFeatures(
|
|
||||||
playground_quality=70,
|
|
||||||
sports_facilities=True,
|
|
||||||
running_paths=True,
|
|
||||||
cycling_paths=True,
|
|
||||||
dog_friendly=True,
|
|
||||||
bbq_allowed=False
|
|
||||||
),
|
|
||||||
nearby_amenities=[],
|
|
||||||
last_updated=datetime.now(),
|
|
||||||
data_sources=["berlin_open_data", "osm"],
|
|
||||||
confidence_score=95
|
|
||||||
),
|
|
||||||
GreenSpace(
|
|
||||||
id="volkspark_friedrichshain",
|
|
||||||
name="Volkspark Friedrichshain",
|
|
||||||
description="Historic park with fairy tale fountain and sports facilities",
|
|
||||||
type=GreenSpaceType.PARK,
|
|
||||||
coordinates=Coordinates(lat=52.5263, lng=13.4317),
|
|
||||||
neighborhood="Friedrichshain",
|
|
||||||
address="Friedrichshain, 10249 Berlin",
|
|
||||||
area_sqm=49000,
|
|
||||||
perimeter_m=2800,
|
|
||||||
environmental=EnvironmentalFeatures(
|
|
||||||
tree_coverage_percent=70,
|
|
||||||
shade_quality=75,
|
|
||||||
noise_level=NoiseLevel.QUIET,
|
|
||||||
wildlife_diversity_score=65,
|
|
||||||
water_features=True,
|
|
||||||
natural_surface_percent=80
|
|
||||||
),
|
|
||||||
accessibility=AccessibilityFeatures(
|
|
||||||
wheelchair_accessible=True,
|
|
||||||
public_transport_score=4,
|
|
||||||
cycling_infrastructure=True,
|
|
||||||
parking_availability=2,
|
|
||||||
lighting_quality=3
|
|
||||||
),
|
|
||||||
recreation=RecreationFeatures(
|
|
||||||
playground_quality=85,
|
|
||||||
sports_facilities=True,
|
|
||||||
running_paths=True,
|
|
||||||
cycling_paths=True,
|
|
||||||
dog_friendly=True,
|
|
||||||
bbq_allowed=True
|
|
||||||
),
|
|
||||||
nearby_amenities=[],
|
|
||||||
last_updated=datetime.now(),
|
|
||||||
data_sources=["berlin_open_data", "osm"],
|
|
||||||
confidence_score=90
|
|
||||||
),
|
|
||||||
GreenSpace(
|
|
||||||
id="tempelhofer_feld",
|
|
||||||
name="Tempelhofer Feld",
|
|
||||||
description="Former airport turned into unique urban park",
|
|
||||||
type=GreenSpaceType.PARK,
|
|
||||||
coordinates=Coordinates(lat=52.4732, lng=13.4015),
|
|
||||||
neighborhood="Tempelhof",
|
|
||||||
address="Tempelhofer Damm, 12101 Berlin",
|
|
||||||
area_sqm=300000,
|
|
||||||
perimeter_m=6200,
|
|
||||||
environmental=EnvironmentalFeatures(
|
|
||||||
tree_coverage_percent=15,
|
|
||||||
shade_quality=20,
|
|
||||||
noise_level=NoiseLevel.MODERATE,
|
|
||||||
wildlife_diversity_score=40,
|
|
||||||
water_features=False,
|
|
||||||
natural_surface_percent=60
|
|
||||||
),
|
|
||||||
accessibility=AccessibilityFeatures(
|
|
||||||
wheelchair_accessible=True,
|
|
||||||
public_transport_score=4,
|
|
||||||
cycling_infrastructure=True,
|
|
||||||
parking_availability=4,
|
|
||||||
lighting_quality=2
|
|
||||||
),
|
|
||||||
recreation=RecreationFeatures(
|
|
||||||
playground_quality=30,
|
|
||||||
sports_facilities=False,
|
|
||||||
running_paths=True,
|
|
||||||
cycling_paths=True,
|
|
||||||
dog_friendly=True,
|
|
||||||
bbq_allowed=True
|
|
||||||
),
|
|
||||||
nearby_amenities=[],
|
|
||||||
last_updated=datetime.now(),
|
|
||||||
data_sources=["berlin_open_data", "osm"],
|
|
||||||
confidence_score=85
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
|
@ -4,6 +4,10 @@ from pathlib import Path
|
||||||
from typing import List, Optional, Tuple, Dict, Any
|
from typing import List, Optional, Tuple, Dict, Any
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from geopy.distance import geodesic
|
from geopy.distance import geodesic
|
||||||
|
from rtree import index
|
||||||
|
import asyncio
|
||||||
|
import aiofiles
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from app.models.street_tree import (
|
from app.models.street_tree import (
|
||||||
StreetTree, TreeDensityMetrics, TreeShadeAnalysis, TreesSearchFilters,
|
StreetTree, TreeDensityMetrics, TreeShadeAnalysis, TreesSearchFilters,
|
||||||
|
@ -14,24 +18,58 @@ from app.models.green_space import Coordinates
|
||||||
class StreetTreeService:
|
class StreetTreeService:
|
||||||
"""Service for accessing and analyzing Berlin street trees data."""
|
"""Service for accessing and analyzing Berlin street trees data."""
|
||||||
|
|
||||||
def __init__(self):
|
_instance = None
|
||||||
self._trees_cache = None
|
_initialized = False
|
||||||
self._trees_index = None
|
|
||||||
self.data_dir = Path("app/data")
|
|
||||||
|
|
||||||
def _load_trees(self) -> List[Dict]:
|
def __new__(cls):
|
||||||
"""Load street trees data from JSON file."""
|
if cls._instance is None:
|
||||||
|
cls._instance = super().__new__(cls)
|
||||||
|
return cls._instance
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
if not self._initialized:
|
||||||
|
self._trees_cache = None
|
||||||
|
self._spatial_index = None
|
||||||
|
self._tree_id_to_data = {}
|
||||||
|
self.data_dir = Path("app/data")
|
||||||
|
self.__class__._initialized = True
|
||||||
|
|
||||||
|
async def _load_trees(self) -> List[Dict]:
|
||||||
|
"""Load street trees data from JSON file and build spatial index."""
|
||||||
if self._trees_cache is None:
|
if self._trees_cache is None:
|
||||||
trees_file = self.data_dir / "processed" / "street_trees.json"
|
trees_file = self.data_dir / "processed" / "street_trees.json"
|
||||||
if trees_file.exists():
|
if trees_file.exists():
|
||||||
with open(trees_file, 'r', encoding='utf-8') as f:
|
print("🔄 Loading trees data and building spatial index...")
|
||||||
data = json.load(f)
|
async with aiofiles.open(trees_file, 'r', encoding='utf-8') as f:
|
||||||
|
content = await f.read()
|
||||||
|
data = json.loads(content)
|
||||||
self._trees_cache = data.get("street_trees", [])
|
self._trees_cache = data.get("street_trees", [])
|
||||||
|
await self._build_spatial_index()
|
||||||
|
print(f"✅ Loaded {len(self._trees_cache)} trees with spatial index")
|
||||||
else:
|
else:
|
||||||
print("Warning: street_trees.json not found. Run process_street_trees.py first.")
|
print("Warning: street_trees.json not found. Run process_street_trees.py first.")
|
||||||
self._trees_cache = []
|
self._trees_cache = []
|
||||||
return self._trees_cache
|
return self._trees_cache
|
||||||
|
|
||||||
|
async def _build_spatial_index(self):
|
||||||
|
"""Build R-tree spatial index for fast location queries."""
|
||||||
|
if self._spatial_index is None and self._trees_cache:
|
||||||
|
print("🔨 Building spatial index...")
|
||||||
|
self._spatial_index = index.Index()
|
||||||
|
self._tree_id_to_data = {}
|
||||||
|
|
||||||
|
for i, tree_data in enumerate(self._trees_cache):
|
||||||
|
lat = tree_data.get('lat')
|
||||||
|
lng = tree_data.get('lng')
|
||||||
|
|
||||||
|
if lat is not None and lng is not None:
|
||||||
|
# R-tree expects (minx, miny, maxx, maxy)
|
||||||
|
bbox = (lng, lat, lng, lat)
|
||||||
|
self._spatial_index.insert(i, bbox)
|
||||||
|
self._tree_id_to_data[i] = tree_data
|
||||||
|
|
||||||
|
print(f"✅ Spatial index built for {len(self._tree_id_to_data)} trees")
|
||||||
|
|
||||||
def _create_tree_from_dict(self, tree_data: Dict) -> StreetTree:
|
def _create_tree_from_dict(self, tree_data: Dict) -> StreetTree:
|
||||||
"""Convert tree dictionary to StreetTree model."""
|
"""Convert tree dictionary to StreetTree model."""
|
||||||
|
|
||||||
|
@ -94,6 +132,11 @@ class StreetTreeService:
|
||||||
last_updated=datetime.now()
|
last_updated=datetime.now()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1000)
|
||||||
|
def _distance_cache(self, lat1: float, lng1: float, lat2: float, lng2: float) -> float:
|
||||||
|
"""Cache distance calculations."""
|
||||||
|
return geodesic((lat1, lng1), (lat2, lng2)).meters
|
||||||
|
|
||||||
async def get_trees_near_location(
|
async def get_trees_near_location(
|
||||||
self,
|
self,
|
||||||
lat: float,
|
lat: float,
|
||||||
|
@ -101,31 +144,48 @@ class StreetTreeService:
|
||||||
radius_m: int = 500,
|
radius_m: int = 500,
|
||||||
limit: Optional[int] = None
|
limit: Optional[int] = None
|
||||||
) -> TreesNearLocationResponse:
|
) -> TreesNearLocationResponse:
|
||||||
"""Get street trees within a radius of a location."""
|
"""Get street trees within a radius of a location using spatial index."""
|
||||||
start_time = datetime.now()
|
start_time = datetime.now()
|
||||||
|
|
||||||
trees_data = self._load_trees()
|
await self._load_trees()
|
||||||
nearby_trees = []
|
nearby_trees = []
|
||||||
|
|
||||||
for tree_data in trees_data:
|
if self._spatial_index is None:
|
||||||
|
# Fallback to linear search if index failed
|
||||||
|
return await self._get_trees_linear_search(lat, lng, radius_m, limit)
|
||||||
|
|
||||||
|
# Convert radius to approximate bounding box for R-tree query
|
||||||
|
# Rough approximation: 1 degree ≈ 111km
|
||||||
|
radius_deg = radius_m / 111000
|
||||||
|
bbox = (lng - radius_deg, lat - radius_deg, lng + radius_deg, lat + radius_deg)
|
||||||
|
|
||||||
|
# Query spatial index for candidates
|
||||||
|
candidate_ids = list(self._spatial_index.intersection(bbox))
|
||||||
|
|
||||||
|
# Filter candidates by exact distance
|
||||||
|
tree_distances = []
|
||||||
|
for tree_id in candidate_ids:
|
||||||
|
tree_data = self._tree_id_to_data.get(tree_id)
|
||||||
|
if not tree_data:
|
||||||
|
continue
|
||||||
|
|
||||||
tree_lat = tree_data.get('lat')
|
tree_lat = tree_data.get('lat')
|
||||||
tree_lng = tree_data.get('lng')
|
tree_lng = tree_data.get('lng')
|
||||||
|
|
||||||
if tree_lat is None or tree_lng is None:
|
if tree_lat is None or tree_lng is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
distance = geodesic((lat, lng), (tree_lat, tree_lng)).meters
|
distance = self._distance_cache(lat, lng, tree_lat, tree_lng)
|
||||||
if distance <= radius_m:
|
if distance <= radius_m:
|
||||||
tree = self._create_tree_from_dict(tree_data)
|
tree = self._create_tree_from_dict(tree_data)
|
||||||
nearby_trees.append(tree)
|
tree_distances.append((tree, distance))
|
||||||
|
|
||||||
if limit and len(nearby_trees) >= limit:
|
if limit and len(tree_distances) >= limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Sort by distance
|
# Sort by distance
|
||||||
nearby_trees.sort(
|
tree_distances.sort(key=lambda x: x[1])
|
||||||
key=lambda t: geodesic((lat, lng), (t.coordinates.lat, t.coordinates.lng)).meters
|
nearby_trees = [tree for tree, _ in tree_distances]
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate metrics
|
# Calculate metrics
|
||||||
metrics = self._calculate_tree_density_metrics(nearby_trees, radius_m)
|
metrics = self._calculate_tree_density_metrics(nearby_trees, radius_m)
|
||||||
|
@ -212,7 +272,7 @@ class StreetTreeService:
|
||||||
large_trees = []
|
large_trees = []
|
||||||
|
|
||||||
for tree in trees:
|
for tree in trees:
|
||||||
distance = geodesic((lat, lng), (tree.coordinates.lat, tree.coordinates.lng)).meters
|
distance = self._distance_cache(lat, lng, tree.coordinates.lat, tree.coordinates.lng)
|
||||||
|
|
||||||
if distance <= 50:
|
if distance <= 50:
|
||||||
trees_50m += 1
|
trees_50m += 1
|
||||||
|
@ -259,9 +319,58 @@ class StreetTreeService:
|
||||||
canopy_density=len(large_trees) / max(1, len(trees)) if trees else 0
|
canopy_density=len(large_trees) / max(1, len(trees)) if trees else 0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def _get_trees_linear_search(
|
||||||
|
self,
|
||||||
|
lat: float,
|
||||||
|
lng: float,
|
||||||
|
radius_m: int = 500,
|
||||||
|
limit: Optional[int] = None
|
||||||
|
) -> TreesNearLocationResponse:
|
||||||
|
"""Fallback linear search method."""
|
||||||
|
start_time = datetime.now()
|
||||||
|
|
||||||
|
trees_data = await self._load_trees()
|
||||||
|
nearby_trees = []
|
||||||
|
|
||||||
|
for tree_data in trees_data:
|
||||||
|
tree_lat = tree_data.get('lat')
|
||||||
|
tree_lng = tree_data.get('lng')
|
||||||
|
|
||||||
|
if tree_lat is None or tree_lng is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
distance = self._distance_cache(lat, lng, tree_lat, tree_lng)
|
||||||
|
if distance <= radius_m:
|
||||||
|
tree = self._create_tree_from_dict(tree_data)
|
||||||
|
nearby_trees.append(tree)
|
||||||
|
|
||||||
|
if limit and len(nearby_trees) >= limit:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Sort by distance
|
||||||
|
nearby_trees.sort(
|
||||||
|
key=lambda t: self._distance_cache(lat, lng, t.coordinates.lat, t.coordinates.lng)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate metrics
|
||||||
|
metrics = self._calculate_tree_density_metrics(nearby_trees, radius_m)
|
||||||
|
shade_analysis = self._analyze_shade_coverage(lat, lng, nearby_trees)
|
||||||
|
|
||||||
|
query_time = (datetime.now() - start_time).total_seconds() * 1000
|
||||||
|
|
||||||
|
return TreesNearLocationResponse(
|
||||||
|
location=Coordinates(lat=lat, lng=lng),
|
||||||
|
radius_m=radius_m,
|
||||||
|
trees=nearby_trees,
|
||||||
|
metrics=metrics,
|
||||||
|
shade_analysis=shade_analysis,
|
||||||
|
total_found=len(nearby_trees),
|
||||||
|
query_time_ms=int(query_time)
|
||||||
|
)
|
||||||
|
|
||||||
async def search_trees(self, filters: TreesSearchFilters) -> List[StreetTree]:
|
async def search_trees(self, filters: TreesSearchFilters) -> List[StreetTree]:
|
||||||
"""Search trees with filters."""
|
"""Search trees with filters."""
|
||||||
trees_data = self._load_trees()
|
trees_data = await self._load_trees()
|
||||||
filtered_trees = []
|
filtered_trees = []
|
||||||
|
|
||||||
for tree_data in trees_data:
|
for tree_data in trees_data:
|
||||||
|
@ -272,10 +381,10 @@ class StreetTreeService:
|
||||||
if tree_lat is None or tree_lng is None:
|
if tree_lat is None or tree_lng is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
distance = geodesic(
|
distance = self._distance_cache(
|
||||||
(filters.center_lat, filters.center_lng),
|
filters.center_lat, filters.center_lng,
|
||||||
(tree_lat, tree_lng)
|
tree_lat, tree_lng
|
||||||
).meters
|
)
|
||||||
if distance > filters.within_radius_m:
|
if distance > filters.within_radius_m:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -305,7 +414,7 @@ class StreetTreeService:
|
||||||
|
|
||||||
async def get_tree_stats(self) -> Dict[str, Any]:
|
async def get_tree_stats(self) -> Dict[str, Any]:
|
||||||
"""Get overall statistics about Berlin street trees."""
|
"""Get overall statistics about Berlin street trees."""
|
||||||
trees_data = self._load_trees()
|
trees_data = await self._load_trees()
|
||||||
|
|
||||||
if not trees_data:
|
if not trees_data:
|
||||||
return {"error": "No tree data available"}
|
return {"error": "No tree data available"}
|
||||||
|
|
|
@ -35,6 +35,8 @@ dependencies = [
|
||||||
"redis>=5.0.0",
|
"redis>=5.0.0",
|
||||||
"aiofiles>=23.2.0",
|
"aiofiles>=23.2.0",
|
||||||
"openpyxl>=3.1.5",
|
"openpyxl>=3.1.5",
|
||||||
|
"tqdm>=4.67.1",
|
||||||
|
"rtree>=1.4.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|
|
@ -0,0 +1,467 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Enhanced Berlin green space processor using existing tree and toilet services.
|
||||||
|
Downloads OSM green space boundaries and enhances them with real data using existing services.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
import requests
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
import geopandas as gpd
|
||||||
|
import pandas as pd
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add the app directory to Python path to import services
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from app.services.street_tree_service import StreetTreeService
|
||||||
|
from app.services.berlin_data_service import BerlinDataService
|
||||||
|
|
||||||
|
|
||||||
|
class RealDataGreenSpaceProcessor:
|
||||||
|
def __init__(self, data_dir: str = "app/data"):
    """Set up data directories and the existing data services.

    Args:
        data_dir: Root data directory; raw downloads go to ``geo-raw`` and
            the final JSON output to ``processed`` underneath it.
    """
    self.data_dir = Path(data_dir)
    self.raw_dir = self.data_dir / "geo-raw"        # downloaded source files (OSM, districts)
    self.processed_dir = self.data_dir / "processed"  # enhanced JSON output

    # Create directories
    self.raw_dir.mkdir(parents=True, exist_ok=True)
    self.processed_dir.mkdir(parents=True, exist_ok=True)

    # Initialize existing services (reused for tree and toilet lookups)
    self.tree_service = StreetTreeService()
    self.berlin_data = BerlinDataService()
||||||
|
|
||||||
|
def download_berlin_districts(self):
    """Download Berlin district boundaries.

    Returns the path to the cached GeoJSON file; the download is skipped
    when the file is already on disk.
    """
    target = self.raw_dir / "bezirksgrenzen.geojson"

    # Reuse a previously downloaded copy when present.
    if target.exists():
        print(f"Berlin district data already exists: {target}")
        return target

    url = "https://tsb-opendata.s3.eu-central-1.amazonaws.com/bezirksgrenzen/bezirksgrenzen.geojson"
    print(f"Downloading Berlin district data from {url}")

    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        target.write_bytes(resp.content)
        print(f"Downloaded to {target}")
        return target
    except Exception as exc:
        print(f"Error downloading districts: {exc}")
        raise
||||||
|
|
||||||
|
def download_osm_data(self):
    """Download and extract the Geofabrik Berlin OSM shapefile bundle.

    Returns the directory containing the extracted shapefiles. Both the
    download and the extraction are skipped when their results already
    exist on disk.

    Fix: the archive is now downloaded to a ``.part`` file and renamed only
    after the stream completes, so an interrupted download no longer leaves
    a truncated zip that later runs would treat as complete.
    """
    zip_file = self.raw_dir / "berlin_shapes.zip"
    shp_dir = self.raw_dir / "berlin_shapes"

    # Check if already extracted
    required_files = ["gis_osm_landuse_a_free_1.shp", "gis_osm_natural_a_free_1.shp", "gis_osm_leisure_a_free_1.shp"]
    if all((shp_dir / f).exists() for f in required_files):
        print(f"Berlin OSM data already exists: {shp_dir}")
        return shp_dir

    if not zip_file.exists():
        link = "https://download.geofabrik.de/europe/germany/berlin-latest-free.shp.zip"
        print(f"Downloading Berlin OSM data from {link} (this may take several minutes...)")

        # Stream into a temp file; publish under the final name only on success.
        part_file = zip_file.with_name(zip_file.name + ".part")
        try:
            response = requests.get(link, stream=True, timeout=300)  # 5 min timeout
            response.raise_for_status()

            with open(part_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            part_file.replace(zip_file)
            print(f"Download completed: {zip_file}")
        except Exception as e:
            print(f"Error downloading OSM data: {e}")
            raise

    print(f"Extracting Berlin OSM data to {shp_dir}")
    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(shp_dir)
        print(f"Extracted to {shp_dir}")
    except Exception as e:
        print(f"Error extracting OSM data: {e}")
        raise

    return shp_dir
||||||
|
|
||||||
|
def load_osm_green_spaces(self):
    """Load OSM green space polygons clipped to Berlin.

    Returns a GeoDataFrame with ``area_sqm``, district (``Bezirk``) and
    centroid lat/lng columns, filtered to areas of at least 1000 sqm.

    Raises:
        ValueError: if no green space features could be loaded at all.
    """
    print("Loading OSM green space boundaries...")

    # Download required data
    districts_file = self.download_berlin_districts()
    shp_dir = self.download_osm_data()

    # Load Berlin districts for clipping
    districts = gpd.read_file(districts_file)

    # Define green space categories we want
    green_categories = {
        'landuse': ['forest', 'grass', 'meadow', 'recreation_ground', 'village_green', 'allotments'],
        'natural': ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood'],
        'leisure': ['park', 'garden', 'nature_reserve', 'playground', 'pitch', 'common', 'golf_course']
    }

    all_green_spaces = []

    # Process each category
    for category, subcategories in green_categories.items():
        shapefile = shp_dir / f"gis_osm_{category}_a_free_1.shp"

        if not shapefile.exists():
            print(f"Warning: {shapefile} not found, skipping")
            continue

        print(f"Processing {category} data...")
        try:
            gdf = gpd.read_file(shapefile)

            # Filter to relevant subcategories
            gdf_filtered = gdf[gdf['fclass'].isin(subcategories)].copy()

            if len(gdf_filtered) == 0:
                print(f"No {category} features found in subcategories")
                continue

            # Clip to Berlin boundaries
            gdf_clipped = gpd.clip(gdf_filtered, districts)

            # BUG FIX: Geofabrik shapefiles are in WGS84 (degrees), and
            # GeoSeries.area on a geographic CRS yields square *degrees*, not
            # square meters, which broke the 1000-sqm filter. Project to a
            # metric equal-area CRS (EPSG:3035) before measuring.
            gdf_clipped['area_sqm'] = gdf_clipped.geometry.to_crs(epsg=3035).area
            gdf_clipped = gdf_clipped[gdf_clipped['area_sqm'] >= 1000]

            if len(gdf_clipped) > 0:
                all_green_spaces.append(gdf_clipped)
                print(f"Found {len(gdf_clipped)} {category} features")

        except Exception as e:
            print(f"Error processing {category}: {e}")
            continue

    if not all_green_spaces:
        raise ValueError("No green space data found")

    # Combine all green spaces
    green_spaces = gpd.GeoDataFrame(pd.concat(all_green_spaces, ignore_index=True))

    # Add district information
    # NOTE(review): assumes the district file exposes a 'Bezirk' column — confirm
    # against the downloaded GeoJSON schema.
    green_spaces = gpd.sjoin(green_spaces, districts[['Bezirk', 'geometry']], how='left')

    # Calculate centroids for analysis (in the source lat/lng CRS)
    green_spaces['centroid'] = green_spaces.geometry.centroid
    green_spaces['centroid_lat'] = green_spaces.centroid.y
    green_spaces['centroid_lng'] = green_spaces.centroid.x

    print(f"Total green spaces found: {len(green_spaces)}")
    return green_spaces
||||||
|
|
||||||
|
async def enhance_green_space_with_real_data(self, row):
    """Enhance a single green space with real tree and toilet data.

    Args:
        row: one GeoDataFrame row (pandas Series) as produced by
            ``load_osm_green_spaces`` — must carry ``centroid_lat``,
            ``centroid_lng`` and ``area_sqm``; ``row.name`` is the index label.

    Returns:
        A dict describing the enhanced green space, or ``None`` when
        enhancement fails (errors are printed, not raised).
    """
    try:
        lat = row['centroid_lat']
        lng = row['centroid_lng']
        area_sqm = int(row['area_sqm'])

        # Use existing tree service to get real tree data
        tree_response = await self.tree_service.get_trees_near_location(
            lat, lng, radius_m=min(400, int((area_sqm ** 0.5) * 1.5))  # Adaptive radius
        )

        # Use existing toilet service to get real toilet data
        nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 800)

        # Calculate toilet accessibility score
        toilet_score = self._score_toilet_accessibility(nearby_toilets)

        # Map OSM type to our enum
        space_type = self._map_osm_to_space_type(row.get('fclass', ''))

        # Generate ID
        space_id = f"real_{row.get('fclass', 'unknown')}_{row.name}"

        # Create enhanced green space using real data
        enhanced_space = {
            "id": space_id,
            "name": row.get('name') or f"{row.get('fclass', 'Green Space').title()} in {row.get('Bezirk', 'Berlin')}",
            "description": f"Real Berlin {row.get('fclass', 'green space')} enhanced with tree and toilet data",
            "type": space_type,
            "coordinates": {
                "lat": float(lat),
                "lng": float(lng)
            },
            "neighborhood": row.get('Bezirk', 'Unknown'),
            "area_sqm": area_sqm,
            # NOTE(review): geometry.length on a WGS84 geometry is in degrees,
            # not meters — TODO confirm intended units.
            "perimeter_m": int(row.geometry.length) if hasattr(row.geometry, 'length') else 0,

            # Environmental features using real tree data
            "environmental": {
                "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                "shade_quality": tree_response.shade_analysis.shade_quality_score,
                "noise_level": self._estimate_noise_level(row.get('fclass', ''), row.get('Bezirk', '')),
                "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                # Heuristic: flag water if the OSM class or name mentions it.
                "water_features": 'water' in str(row.get('fclass', '')).lower() or 'river' in str(row.get('name', '')).lower(),
                "natural_surface_percent": self._estimate_natural_surface(row.get('fclass', ''))
            },

            # Real tree metrics from existing service
            "tree_data": {
                "total_trees": tree_response.metrics.total_trees,
                "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                "species_count": len(tree_response.metrics.dominant_species),
                "species_diversity_score": tree_response.metrics.species_diversity_score,
                "mature_trees_count": tree_response.metrics.mature_trees_count,
                "young_trees_count": tree_response.metrics.young_trees_count,
                "average_tree_age": tree_response.metrics.average_tree_age,
                "average_height": tree_response.metrics.average_height,
                "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                "dominant_species": tree_response.metrics.dominant_species
            },

            # Real toilet accessibility from existing service
            "toilet_accessibility": {
                "nearby_toilets_count": len(nearby_toilets),
                "accessibility_score": toilet_score,
                # assumes toilets are returned nearest-first — TODO confirm
                "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
            },

            # Standard accessibility features (static defaults, not measured)
            "accessibility": {
                "wheelchair_accessible": True,
                "public_transport_score": 3,  # Could be enhanced with real transit data
                "cycling_infrastructure": area_sqm > 5000,
                "parking_availability": 2,
                "lighting_quality": 2
            },

            # Recreation features based on OSM data and size
            "recreation": {
                "playground_quality": self._estimate_playground_quality(row.get('fclass', ''), tree_response.metrics.total_trees),
                "sports_facilities": 'pitch' in str(row.get('fclass', '')).lower() or 'sport' in str(row.get('name', '')).lower(),
                "running_paths": area_sqm > 8000,
                "cycling_paths": area_sqm > 15000,
                "dog_friendly": True,
                "bbq_allowed": row.get('fclass') in ['park', 'recreation_ground'] and area_sqm > 5000
            },

            "last_updated": datetime.now().isoformat(),
            "data_sources": ["openstreetmap", "berlin_tree_cadastre", "berlin_toilets"],
            "confidence_score": 95
        }

        return enhanced_space

    except Exception as e:
        # Best-effort: one failing space must not abort the batch run.
        print(f"Error enhancing green space {row.name}: {e}")
        return None
||||||
|
|
||||||
|
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
|
||||||
|
"""Score toilet accessibility using existing toilet data."""
|
||||||
|
if not nearby_toilets:
|
||||||
|
return 20
|
||||||
|
|
||||||
|
nearest_distance = nearby_toilets[0]['distance_meters']
|
||||||
|
|
||||||
|
# Distance-based scoring
|
||||||
|
if nearest_distance <= 200:
|
||||||
|
score = 100
|
||||||
|
elif nearest_distance <= 400:
|
||||||
|
score = 80
|
||||||
|
elif nearest_distance <= 600:
|
||||||
|
score = 60
|
||||||
|
else:
|
||||||
|
score = 40
|
||||||
|
|
||||||
|
# Bonuses for quality
|
||||||
|
free_toilets = len([t for t in nearby_toilets if t.get('is_free', False)])
|
||||||
|
accessible_toilets = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
|
||||||
|
|
||||||
|
score += min(20, free_toilets * 5 + accessible_toilets * 3)
|
||||||
|
|
||||||
|
return min(100, score)
|
||||||
|
|
||||||
|
def _map_osm_to_space_type(self, fclass: str) -> str:
|
||||||
|
"""Map OSM feature class to green space types."""
|
||||||
|
mapping = {
|
||||||
|
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN',
|
||||||
|
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
|
||||||
|
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
|
||||||
|
'wood': 'FOREST', 'heath': 'HEATH', 'pitch': 'SPORTS_AREA',
|
||||||
|
'golf_course': 'SPORTS_AREA', 'common': 'PARK', 'village_green': 'GRASS',
|
||||||
|
'allotments': 'GARDEN'
|
||||||
|
}
|
||||||
|
return mapping.get(fclass, 'PARK')
|
||||||
|
|
||||||
|
def _estimate_noise_level(self, fclass: str, district: str) -> int:
|
||||||
|
"""Estimate noise level (1=very quiet, 5=very noisy)."""
|
||||||
|
base_noise = {
|
||||||
|
'forest': 1, 'nature_reserve': 1, 'wood': 1,
|
||||||
|
'meadow': 2, 'grass': 2, 'heath': 2,
|
||||||
|
'park': 2, 'garden': 2, 'common': 2,
|
||||||
|
'recreation_ground': 3, 'playground': 3, 'pitch': 3,
|
||||||
|
'golf_course': 2, 'allotments': 2
|
||||||
|
}
|
||||||
|
|
||||||
|
# Central districts are noisier
|
||||||
|
central_districts = ['Mitte', 'Kreuzberg', 'Friedrichshain']
|
||||||
|
district_modifier = 1 if district in central_districts else 0
|
||||||
|
|
||||||
|
return min(5, base_noise.get(fclass, 2) + district_modifier)
|
||||||
|
|
||||||
|
def _estimate_natural_surface(self, fclass: str) -> int:
|
||||||
|
"""Estimate percentage of natural surface."""
|
||||||
|
surface_map = {
|
||||||
|
'forest': 95, 'nature_reserve': 95, 'wood': 95,
|
||||||
|
'meadow': 95, 'grass': 90, 'heath': 90,
|
||||||
|
'park': 75, 'garden': 65, 'common': 80,
|
||||||
|
'recreation_ground': 60, 'playground': 40, 'pitch': 20,
|
||||||
|
'golf_course': 70, 'allotments': 85
|
||||||
|
}
|
||||||
|
return surface_map.get(fclass, 70)
|
||||||
|
|
||||||
|
def _estimate_playground_quality(self, fclass: str, tree_count: int) -> int:
|
||||||
|
"""Estimate playground quality score."""
|
||||||
|
base_scores = {
|
||||||
|
'playground': 85,
|
||||||
|
'park': 65,
|
||||||
|
'recreation_ground': 70,
|
||||||
|
'garden': 40,
|
||||||
|
'common': 50
|
||||||
|
}
|
||||||
|
|
||||||
|
base = base_scores.get(fclass, 25)
|
||||||
|
|
||||||
|
# Trees improve playground appeal for families
|
||||||
|
tree_bonus = min(15, tree_count // 5) # +3 per 5 trees, max 15
|
||||||
|
|
||||||
|
return min(100, base + tree_bonus)
|
||||||
|
|
||||||
|
async def process_all_green_spaces(self):
    """Process all green spaces with real data enhancement.

    Loads OSM boundaries, then enhances every space (sequentially, in
    batches) with tree and toilet data via the existing services.
    Failed spaces are dropped silently (``enhance_green_space_with_real_data``
    returns ``None`` on error).

    Returns:
        List of enhanced green space dicts.
    """
    print("Starting enhanced green space processing with real data...")

    # Load OSM green space boundaries
    osm_green_spaces = self.load_osm_green_spaces()

    enhanced_green_spaces = []

    print(f"Enhancing {len(osm_green_spaces)} green spaces with real tree and toilet data...")

    # Process in batches to avoid overwhelming the system
    batch_size = 50
    total_processed = 0

    for i in range(0, len(osm_green_spaces), batch_size):
        batch = osm_green_spaces.iloc[i:i+batch_size]
        batch_results = []

        # Awaited one-by-one: enhancement is sequential, not concurrent.
        for idx, row in batch.iterrows():
            result = await self.enhance_green_space_with_real_data(row)
            if result:
                batch_results.append(result)

            total_processed += 1
            if total_processed % 25 == 0:
                print(f"Processed {total_processed}/{len(osm_green_spaces)} green spaces...")

        enhanced_green_spaces.extend(batch_results)

        # Small delay between batches
        await asyncio.sleep(0.1)

    print(f"Successfully enhanced {len(enhanced_green_spaces)} green spaces with real data")
    return enhanced_green_spaces
||||||
|
|
||||||
|
def save_enhanced_data(self, enhanced_green_spaces: List[Dict]):
    """Save enhanced green spaces to JSON file.

    Writes ``real_berlin_green_spaces.json`` into the processed directory
    together with summary statistics, and prints a short report.

    Fix: the summary ``print``s previously divided by
    ``len(enhanced_green_spaces)`` unguarded and crashed with
    ZeroDivisionError when the list was empty; percentages are now computed
    once, with an empty-list guard, and reused everywhere.

    Returns:
        Path to the written JSON file.
    """
    output_file = self.processed_dir / "real_berlin_green_spaces.json"

    # Calculate summary statistics (all guarded against an empty input list)
    total = len(enhanced_green_spaces)
    spaces_with_trees = len([gs for gs in enhanced_green_spaces if gs["tree_data"]["total_trees"] > 0])
    spaces_with_toilets = len([gs for gs in enhanced_green_spaces if gs["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(gs["tree_data"]["total_trees"] for gs in enhanced_green_spaces)
    avg_species_per_space = sum(gs["tree_data"]["species_count"] for gs in enhanced_green_spaces) / total if total else 0
    tree_pct = round((spaces_with_trees / total) * 100, 1) if total else 0
    toilet_pct = round((spaces_with_toilets / total) * 100, 1) if total else 0

    data = {
        "green_spaces": enhanced_green_spaces,
        "total_count": total,
        "last_updated": datetime.now().isoformat(),
        "data_sources": [
            "openstreetmap_boundaries",
            "berlin_tree_cadastre_via_service",
            "berlin_toilet_locations_via_service",
            "berlin_districts"
        ],
        "processing_info": {
            "script_version": "1.0",
            "coordinate_system": "WGS84",
            "uses_existing_services": True,
            "tree_analysis_via": "StreetTreeService",
            "toilet_analysis_via": "BerlinDataService"
        },
        "summary_stats": {
            "spaces_with_trees": spaces_with_trees,
            "spaces_with_nearby_toilets": spaces_with_toilets,
            "total_trees_in_all_spaces": total_trees,
            "average_species_per_space": round(avg_species_per_space, 1),
            "coverage_percentage": {
                "with_tree_data": tree_pct,
                "with_toilet_data": toilet_pct
            }
        }
    }

    # ensure_ascii=False keeps German umlauts readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved {total} enhanced green spaces to {output_file}")
    print(f"📊 Summary:")
    print(f"   - {spaces_with_trees} spaces have tree data ({tree_pct}%)")
    print(f"   - {spaces_with_toilets} spaces have nearby toilets ({toilet_pct}%)")
    print(f"   - {total_trees} total trees analyzed")
    print(f"   - {avg_species_per_space:.1f} average species per space")

    return output_file
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Main processing function: run the pipeline and save the result."""
    processor = RealDataGreenSpaceProcessor()

    try:
        # Enhance every OSM green space via the existing services, then persist.
        spaces = await processor.process_all_green_spaces()
        result_path = processor.save_enhanced_data(spaces)

        print(f"\n🎉 Successfully created real data enhanced Berlin green spaces!")
        print(f"📁 Output: {result_path}")

    except KeyboardInterrupt:
        print("\n⚠️ Processing interrupted by user")
    except Exception as e:
        print(f"❌ Error processing data: {e}")
        raise
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the async processing pipeline.
if __name__ == "__main__":
    asyncio.run(main())
|
@ -0,0 +1,613 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Filtered OSM processor for significant Berlin green spaces.
|
||||||
|
Processes only meaningful green spaces (>1000 sqm) with real tree and toilet data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import sys
|
||||||
|
import math
|
||||||
|
from typing import List, Dict, Optional, Tuple
|
||||||
|
|
||||||
|
# Add the app directory to Python path
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from app.services.street_tree_service import StreetTreeService
|
||||||
|
from app.services.berlin_data_service import BerlinDataService
|
||||||
|
|
||||||
|
|
||||||
|
class FilteredOSMProcessor:
|
||||||
|
def __init__(self, data_dir: str = "app/data"):
    """Set up directories, services, and the filtering configuration.

    Args:
        data_dir: Root data directory; raw OSM XML is read from ``osm-raw``
            and results are written under ``processed``.
    """
    self.data_dir = Path(data_dir)
    self.osm_raw_dir = self.data_dir / "osm-raw"      # input: raw .osm XML
    self.processed_dir = self.data_dir / "processed"  # output: processed JSON

    # Initialize services (reused for tree and toilet lookups)
    self.tree_service = StreetTreeService()
    self.berlin_data = BerlinDataService()

    # Berlin bounding box (WGS84) used to discard out-of-city nodes
    self.berlin_bbox = {
        'min_lat': 52.3370, 'max_lat': 52.6755,
        'min_lon': 13.0882, 'max_lon': 13.7611
    }

    # Filtering criteria
    self.min_area_sqm = 1000  # Minimum area to be considered significant
    self.max_spaces = 800  # Maximum number of spaces to process
||||||
|
|
||||||
|
def parse_and_filter_osm_data(self) -> List[Dict]:
    """Parse OSM data and filter for significant green spaces.

    Reads ``berlin_green_spaces.osm``, keeps ways of at least
    ``self.min_area_sqm`` (up to ``self.max_spaces``), and returns them
    sorted by area, largest first. Returns an empty list when the input
    file is missing or unparsable.

    Fix: the area-range report indexed ``filtered_spaces[0]``/``[-1]``
    unconditionally and raised IndexError when no space survived
    filtering; the report is now skipped for an empty result.
    """
    osm_file = self.osm_raw_dir / "berlin_green_spaces.osm"

    if not osm_file.exists():
        print(f"❌ OSM file not found: {osm_file}")
        print("Please run the download first or ensure the file exists.")
        return []

    print(f"📂 Parsing OSM data from {osm_file}")

    try:
        tree = ET.parse(osm_file)
        root = tree.getroot()
        ways = root.findall('.//way')

        print(f"📊 Found {len(ways)} total ways in OSM file")
        print(f"🔍 Filtering for significant green spaces (≥{self.min_area_sqm} sqm)...")

        filtered_spaces = []
        processed_count = 0

        for way in ways:
            processed_count += 1

            if processed_count % 5000 == 0:
                print(f"   Processed {processed_count}/{len(ways)} ways... Found {len(filtered_spaces)} significant spaces")

            try:
                space_data = self._process_osm_way(way, root)
                if space_data and space_data['area_sqm'] >= self.min_area_sqm:
                    filtered_spaces.append(space_data)

                    # Stop if we have enough spaces
                    if len(filtered_spaces) >= self.max_spaces:
                        print(f"✅ Reached target of {self.max_spaces} significant spaces")
                        break

            except Exception:
                # A single malformed way must not abort the whole run.
                continue

        # Sort by area (largest first) to prioritize important spaces
        filtered_spaces.sort(key=lambda x: x['area_sqm'], reverse=True)

        print(f"🎯 Filtered to {len(filtered_spaces)} significant green spaces")
        # Guard the range report: indexing an empty list would raise.
        if filtered_spaces:
            print(f"📏 Area range: {filtered_spaces[-1]['area_sqm']:,} - {filtered_spaces[0]['area_sqm']:,} sqm")

        return filtered_spaces

    except Exception as e:
        print(f"❌ Error parsing OSM file: {e}")
        return []
||||||
|
|
||||||
|
def _process_osm_way(self, way, root) -> Optional[Dict]:
    """Process a single OSM way into green space format.

    Args:
        way: an ElementTree ``<way>`` element.
        root: the document root, used to resolve ``<nd ref=...>`` node ids.

    Returns:
        A dict describing the green space, or ``None`` when the way is not
        a relevant green space, has too few in-bounds nodes, or is smaller
        than ``self.min_area_sqm``.
    """
    # Get tags
    tags = {}
    for tag in way.findall('tag'):
        tags[tag.get('k')] = tag.get('v')

    # Check if it's a significant green space
    green_space_type = self._get_green_space_type(tags)
    if not green_space_type:
        return None

    # Skip certain types that are usually small or not parks
    skip_types = ['grave_yard', 'cemetery', 'allotments']
    if green_space_type in skip_types:
        return None

    # Get node references
    nd_refs = [nd.get('ref') for nd in way.findall('nd')]
    if len(nd_refs) < 3:  # Need at least 3 points for an area
        return None

    # Find node coordinates
    # NOTE(review): root.find scans the whole document per node ref (O(N)
    # each) — consider building an id->node index once per file if parsing
    # becomes slow on large extracts.
    coordinates = []
    for nd_ref in nd_refs:
        node = root.find(f".//node[@id='{nd_ref}']")
        if node is not None:
            lat = float(node.get('lat'))
            lon = float(node.get('lon'))

            # Check if within Berlin bounds
            if (self.berlin_bbox['min_lat'] <= lat <= self.berlin_bbox['max_lat'] and
                    self.berlin_bbox['min_lon'] <= lon <= self.berlin_bbox['max_lon']):
                coordinates.append((lat, lon))

    if len(coordinates) < 3:
        return None

    # Calculate centroid and area
    centroid_lat, centroid_lon = self._calculate_centroid(coordinates)
    area_sqm = self._calculate_area(coordinates)

    # Skip if too small
    if area_sqm < self.min_area_sqm:
        return None

    # Get name (fall back to a synthetic type+location label)
    name = tags.get('name')
    if not name:
        name = f"{green_space_type.title()} near {centroid_lat:.3f}, {centroid_lon:.3f}"

    # Estimate district
    district = self._estimate_district(centroid_lat, centroid_lon)

    return {
        'id': f"osm_way_{way.get('id')}",
        'name': name,
        'fclass': green_space_type,
        'lat': centroid_lat,
        'lng': centroid_lon,
        'area_sqm': int(area_sqm),
        'district': district,
        'osm_tags': tags,
        'osm_id': way.get('id'),
        'has_name': bool(tags.get('name'))  # Track if it has a real name
    }
||||||
|
|
||||||
|
def _get_green_space_type(self, tags: Dict) -> Optional[str]:
|
||||||
|
"""Determine if tags represent a significant green space."""
|
||||||
|
# Prioritize leisure tags (usually parks)
|
||||||
|
leisure = tags.get('leisure', '')
|
||||||
|
if leisure in ['park', 'garden', 'nature_reserve', 'recreation_ground', 'playground', 'common']:
|
||||||
|
return leisure
|
||||||
|
|
||||||
|
# Check landuse tags
|
||||||
|
landuse = tags.get('landuse', '')
|
||||||
|
if landuse in ['forest', 'grass', 'meadow', 'recreation_ground', 'village_green']:
|
||||||
|
return landuse
|
||||||
|
|
||||||
|
# Check natural tags (forests, etc.)
|
||||||
|
natural = tags.get('natural', '')
|
||||||
|
if natural in ['forest', 'wood', 'heath']:
|
||||||
|
return natural
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _calculate_centroid(self, coordinates: List[Tuple[float, float]]) -> Tuple[float, float]:
|
||||||
|
"""Calculate centroid of polygon."""
|
||||||
|
lat_sum = sum(coord[0] for coord in coordinates)
|
||||||
|
lon_sum = sum(coord[1] for coord in coordinates)
|
||||||
|
count = len(coordinates)
|
||||||
|
return lat_sum / count, lon_sum / count
|
||||||
|
|
||||||
|
def _calculate_area(self, coordinates: List[Tuple[float, float]]) -> float:
|
||||||
|
"""Calculate area using shoelace formula (approximate for Berlin)."""
|
||||||
|
if len(coordinates) < 3:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# Convert to approximate meters for Berlin latitude
|
||||||
|
lat_to_m = 111000 # meters per degree latitude
|
||||||
|
lon_to_m = 111000 * math.cos(math.radians(52.5)) # adjust for Berlin
|
||||||
|
|
||||||
|
# Convert to meters
|
||||||
|
coords_m = [(lat * lat_to_m, lon * lon_to_m) for lat, lon in coordinates]
|
||||||
|
|
||||||
|
# Shoelace formula
|
||||||
|
area = 0
|
||||||
|
n = len(coords_m)
|
||||||
|
|
||||||
|
for i in range(n):
|
||||||
|
j = (i + 1) % n
|
||||||
|
area += coords_m[i][0] * coords_m[j][1]
|
||||||
|
area -= coords_m[j][0] * coords_m[i][1]
|
||||||
|
|
||||||
|
return abs(area) / 2
|
||||||
|
|
||||||
|
def _estimate_district(self, lat: float, lng: float) -> str:
|
||||||
|
"""Estimate Berlin district from coordinates."""
|
||||||
|
# Simplified district boundaries
|
||||||
|
if lat > 52.55:
|
||||||
|
return "Pankow" if lng < 13.45 else "Lichtenberg"
|
||||||
|
elif lat > 52.52:
|
||||||
|
if lng < 13.25:
|
||||||
|
return "Charlottenburg-Wilmersdorf"
|
||||||
|
elif lng < 13.42:
|
||||||
|
return "Mitte"
|
||||||
|
else:
|
||||||
|
return "Friedrichshain-Kreuzberg"
|
||||||
|
elif lat > 52.45:
|
||||||
|
if lng < 13.25:
|
||||||
|
return "Steglitz-Zehlendorf"
|
||||||
|
elif lng < 13.42:
|
||||||
|
return "Tempelhof-Schöneberg"
|
||||||
|
else:
|
||||||
|
return "Neukölln"
|
||||||
|
else:
|
||||||
|
return "Treptow-Köpenick"
|
||||||
|
|
||||||
|
    async def enhance_green_space_with_real_data(self, space_data: Dict):
        """Enhance one parsed OSM green space with real tree and toilet data.

        Combines the OSM-derived fields in ``space_data`` (id, name, fclass,
        lat/lng, area_sqm, district, osm_tags, has_name) with live lookups
        against the project's tree and toilet services, producing the
        API-ready green-space dict.  Returns the enriched dict, or ``None``
        if any step fails (the error is printed, not raised).
        """
        try:
            lat = space_data['lat']
            lng = space_data['lng']
            area_sqm = space_data['area_sqm']

            # Adaptive radius based on space size:
            # sqrt(area) scales with the space's side length; clamp to 150-400 m.
            radius = min(400, max(150, int((area_sqm ** 0.5) * 0.8)))

            # Get real data using existing services
            tree_response = await self.tree_service.get_trees_near_location(
                lat, lng, radius_m=radius
            )

            # Toilets are searched in a fixed 600 m radius regardless of space size.
            nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 600)

            # Calculate scores
            toilet_score = self._score_toilet_accessibility(nearby_toilets)
            space_type = self._map_to_space_type(space_data.get('fclass', ''))

            enhanced_space = {
                "id": space_data['id'],
                "name": space_data['name'],
                "description": f"Significant Berlin {space_data.get('fclass', 'green space')} from OSM data",
                "type": space_type,
                "coordinates": {
                    "lat": float(lat),
                    "lng": float(lng)
                },
                "neighborhood": space_data.get('district', 'Unknown'),
                "area_sqm": area_sqm,
                "perimeter_m": int(4 * (area_sqm ** 0.5)),  # Rough estimate (treats the space as a square)

                # Environmental features from real tree data
                "environmental": {
                    "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                    "shade_quality": tree_response.shade_analysis.shade_quality_score,
                    "noise_level": self._estimate_noise_level(space_data),
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    "water_features": self._detect_water_features(space_data),
                    "natural_surface_percent": self._estimate_natural_surface(space_data.get('fclass', ''))
                },

                # Real tree metrics from your existing service
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species[:3]  # Top 3
                },

                # Real toilet accessibility from your existing service
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": toilet_score,
                    # NOTE(review): assumes the toilet list is sorted nearest-first — confirm in BerlinDataService.
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },

                # Accessibility features (heuristics, not measured data)
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": self._estimate_transport_score(space_data.get('district', '')),
                    "cycling_infrastructure": area_sqm > 5000,
                    "parking_availability": 3 if area_sqm > 50000 else 2,
                    "lighting_quality": 3 if 'mitte' in space_data.get('district', '').lower() else 2
                },

                # Recreation features (heuristics keyed on size/type/tags)
                "recreation": {
                    "playground_quality": self._estimate_playground_quality(space_data),
                    "sports_facilities": self._estimate_sports_facilities(space_data),
                    "running_paths": area_sqm > 8000,
                    "cycling_paths": area_sqm > 15000,
                    "dog_friendly": True,
                    "bbq_allowed": self._allows_bbq(space_data)
                },

                # OSM metadata
                "osm_metadata": {
                    "osm_id": space_data.get('osm_id'),
                    "has_official_name": space_data.get('has_name', False),
                    "tags": space_data.get('osm_tags', {}),
                    "source": "filtered_osm_extract"
                },

                "last_updated": datetime.now().isoformat(),
                "data_sources": ["filtered_osm_extract", "berlin_tree_cadastre", "berlin_toilets"],
                # Named spaces are treated as more trustworthy than inferred ones.
                "confidence_score": 95 if space_data.get('has_name') else 85
            }

            return enhanced_space

        except Exception as e:
            # Best-effort enrichment: log and skip this space rather than abort the batch.
            print(f"❌ Error enhancing {space_data['name']}: {e}")
            return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
|
||||||
|
if not nearby_toilets:
|
||||||
|
return 25
|
||||||
|
|
||||||
|
nearest = nearby_toilets[0]['distance_meters']
|
||||||
|
if nearest <= 200:
|
||||||
|
score = 95
|
||||||
|
elif nearest <= 400:
|
||||||
|
score = 80
|
||||||
|
elif nearest <= 600:
|
||||||
|
score = 65
|
||||||
|
else:
|
||||||
|
score = 45
|
||||||
|
|
||||||
|
# Quality bonuses
|
||||||
|
free = len([t for t in nearby_toilets if t.get('is_free', False)])
|
||||||
|
accessible = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
|
||||||
|
score += min(10, free * 5 + accessible * 3)
|
||||||
|
|
||||||
|
return min(100, score)
|
||||||
|
|
||||||
|
def _map_to_space_type(self, fclass: str) -> str:
|
||||||
|
mapping = {
|
||||||
|
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN', 'wood': 'FOREST',
|
||||||
|
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
|
||||||
|
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
|
||||||
|
'common': 'PARK', 'village_green': 'GRASS', 'heath': 'HEATH'
|
||||||
|
}
|
||||||
|
return mapping.get(fclass, 'PARK')
|
||||||
|
|
||||||
|
def _detect_water_features(self, space_data: Dict) -> bool:
|
||||||
|
name = space_data.get('name', '').lower()
|
||||||
|
tags = space_data.get('osm_tags', {})
|
||||||
|
|
||||||
|
water_keywords = ['see', 'teich', 'pond', 'lake', 'bach', 'spree', 'wasser', 'fluss']
|
||||||
|
return (any(keyword in name for keyword in water_keywords) or
|
||||||
|
'water' in str(tags.values()).lower())
|
||||||
|
|
||||||
|
def _estimate_noise_level(self, space_data: Dict) -> int:
|
||||||
|
fclass = space_data.get('fclass', '')
|
||||||
|
district = space_data.get('district', '')
|
||||||
|
area = space_data.get('area_sqm', 0)
|
||||||
|
|
||||||
|
base = {'forest': 1, 'wood': 1, 'nature_reserve': 1, 'heath': 1,
|
||||||
|
'meadow': 2, 'grass': 2, 'park': 2, 'garden': 2,
|
||||||
|
'playground': 3, 'recreation_ground': 3}.get(fclass, 2)
|
||||||
|
|
||||||
|
# Central districts are noisier
|
||||||
|
if any(busy in district.lower() for busy in ['mitte', 'kreuzberg', 'friedrichshain']):
|
||||||
|
base += 1
|
||||||
|
|
||||||
|
# Larger spaces are usually quieter inside
|
||||||
|
if area > 50000:
|
||||||
|
base = max(1, base - 1)
|
||||||
|
|
||||||
|
return min(5, base)
|
||||||
|
|
||||||
|
def _estimate_natural_surface(self, fclass: str) -> int:
|
||||||
|
return {'forest': 95, 'wood': 95, 'nature_reserve': 95, 'heath': 90,
|
||||||
|
'meadow': 95, 'grass': 90, 'park': 80, 'garden': 70,
|
||||||
|
'playground': 45, 'recreation_ground': 75}.get(fclass, 75)
|
||||||
|
|
||||||
|
def _estimate_transport_score(self, district: str) -> int:
|
||||||
|
district_lower = district.lower()
|
||||||
|
if 'mitte' in district_lower:
|
||||||
|
return 5
|
||||||
|
elif any(name in district_lower for name in ['charlottenburg', 'kreuzberg', 'friedrichshain', 'pankow']):
|
||||||
|
return 4
|
||||||
|
else:
|
||||||
|
return 3
|
||||||
|
|
||||||
|
def _estimate_playground_quality(self, space_data: Dict) -> int:
|
||||||
|
fclass = space_data.get('fclass', '')
|
||||||
|
tags = space_data.get('osm_tags', {})
|
||||||
|
area = space_data.get('area_sqm', 0)
|
||||||
|
|
||||||
|
if fclass == 'playground':
|
||||||
|
return 85
|
||||||
|
elif 'playground' in str(tags.values()).lower():
|
||||||
|
return 75
|
||||||
|
elif fclass == 'park':
|
||||||
|
# Larger parks more likely to have good playgrounds
|
||||||
|
return 60 if area > 10000 else 45
|
||||||
|
else:
|
||||||
|
return 30
|
||||||
|
|
||||||
|
def _estimate_sports_facilities(self, space_data: Dict) -> bool:
|
||||||
|
fclass = space_data.get('fclass', '')
|
||||||
|
tags = space_data.get('osm_tags', {})
|
||||||
|
name = space_data.get('name', '').lower()
|
||||||
|
area = space_data.get('area_sqm', 0)
|
||||||
|
|
||||||
|
# Explicit indicators
|
||||||
|
if (fclass == 'recreation_ground' or
|
||||||
|
'sport' in str(tags.values()).lower() or
|
||||||
|
any(term in name for term in ['sport', 'football', 'tennis', 'recreation'])):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Large parks often have sports facilities
|
||||||
|
return fclass == 'park' and area > 20000
|
||||||
|
|
||||||
|
def _allows_bbq(self, space_data: Dict) -> bool:
|
||||||
|
fclass = space_data.get('fclass', '')
|
||||||
|
tags = space_data.get('osm_tags', {})
|
||||||
|
area = space_data.get('area_sqm', 0)
|
||||||
|
|
||||||
|
# Check explicit BBQ tags
|
||||||
|
bbq_tag = tags.get('bbq', '').lower()
|
||||||
|
if bbq_tag == 'yes':
|
||||||
|
return True
|
||||||
|
elif bbq_tag == 'no':
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Default based on type and size
|
||||||
|
return fclass in ['park', 'recreation_ground'] and area > 5000
|
||||||
|
|
||||||
|
    async def process_filtered_green_spaces(self):
        """Main processing pipeline for filtered green spaces.

        Parses the local OSM extract, keeps only spaces meeting the size
        cut-off, enriches each with real tree/toilet data, and returns the
        list of enhanced dicts (empty list when nothing qualifies).
        """
        print("🌳 Processing Significant Berlin Green Spaces")
        print("=" * 55)
        print(f"• Filtering for spaces ≥ {self.min_area_sqm:,} sqm")
        print(f"• Processing up to {self.max_spaces} significant spaces")
        print(f"• Enhancing with real Berlin tree + toilet data")
        print("=" * 55)

        # Step 1: Parse and filter OSM data
        filtered_spaces = self.parse_and_filter_osm_data()

        if not filtered_spaces:
            print("❌ No significant green spaces found")
            return []

        print(f"\n🔧 Enhancing {len(filtered_spaces)} significant spaces with real data...")

        # Step 2: Enhance with real data
        enhanced_spaces = []

        for i, space_data in enumerate(filtered_spaces, 1):
            area_ha = space_data['area_sqm'] / 10000
            print(f"[{i:3d}/{len(filtered_spaces)}] {space_data['name'][:40]:40} ({area_ha:.1f} ha)")

            # Enhancement returns None on failure; failed spaces are skipped.
            result = await self.enhance_green_space_with_real_data(space_data)
            if result:
                enhanced_spaces.append(result)
                trees = result["tree_data"]["total_trees"]
                toilets = result["toilet_accessibility"]["nearby_toilets_count"]
                print(f" ✅ {trees:3d} trees, {toilets} toilets")
            else:
                print(f" ❌ Enhancement failed")

            # Progress update every 50 spaces
            if i % 50 == 0:
                print(f"\n 📊 Progress: {len(enhanced_spaces)}/{i} enhanced successfully")

            # Small delay to be nice to services
            await asyncio.sleep(0.1)

        print(f"\n🎉 Successfully enhanced {len(enhanced_spaces)} significant green spaces!")
        return enhanced_spaces
    def save_enhanced_data(self, enhanced_spaces: List[Dict]):
        """Save the filtered and enhanced dataset.

        Writes ``significant_berlin_green_spaces.json`` into the processed
        directory, including summary statistics and a per-district
        breakdown, prints a final report, and returns the output file path.

        NOTE(review): divides by ``len(enhanced_spaces)`` for coverage rates —
        callers must not pass an empty list (main() already guards this).
        """
        output_file = self.processed_dir / "significant_berlin_green_spaces.json"

        # Calculate comprehensive statistics
        with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
        with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
        total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)
        total_area = sum(s["area_sqm"] for s in enhanced_spaces)

        # Named vs unnamed spaces
        named_spaces = len([s for s in enhanced_spaces if s["osm_metadata"]["has_official_name"]])

        # Area distribution
        large_spaces = len([s for s in enhanced_spaces if s["area_sqm"] > 50000])  # > 5 hectares
        medium_spaces = len([s for s in enhanced_spaces if 10000 <= s["area_sqm"] <= 50000])  # 1-5 hectares
        small_spaces = len([s for s in enhanced_spaces if s["area_sqm"] < 10000])  # < 1 hectare

        # District breakdown
        by_district = {}
        for space in enhanced_spaces:
            district = space['neighborhood']
            if district not in by_district:
                by_district[district] = []
            by_district[district].append(space)

        # Full output document: data + provenance + derived statistics.
        data = {
            "green_spaces": enhanced_spaces,
            "total_count": len(enhanced_spaces),
            "last_updated": datetime.now().isoformat(),
            "data_sources": [
                "filtered_osm_extract_significant_spaces_only",
                "berlin_tree_cadastre_via_street_tree_service",
                "berlin_toilet_locations_via_berlin_data_service"
            ],
            "processing_info": {
                "filtering_criteria": {
                    "minimum_area_sqm": self.min_area_sqm,
                    "maximum_spaces_processed": self.max_spaces,
                    "includes_only_significant_spaces": True
                },
                "enhancement_method": "real_berlin_tree_and_toilet_data",
                "coordinate_system": "WGS84"
            },
            "summary_stats": {
                "total_spaces": len(enhanced_spaces),
                "spaces_with_tree_data": with_trees,
                "spaces_with_toilet_data": with_toilets,
                "total_trees_analyzed": total_trees,
                "total_area_hectares": round(total_area / 10000, 1),
                "coverage_rates": {
                    "tree_data": f"{round((with_trees/len(enhanced_spaces))*100, 1)}%",
                    "toilet_data": f"{round((with_toilets/len(enhanced_spaces))*100, 1)}%"
                },
                "space_categories": {
                    "named_spaces": named_spaces,
                    "unnamed_spaces": len(enhanced_spaces) - named_spaces,
                    "large_spaces_over_5ha": large_spaces,
                    "medium_spaces_1_5ha": medium_spaces,
                    "smaller_spaces_under_1ha": small_spaces
                }
            },
            "district_breakdown": {
                district: len(spaces) for district, spaces in by_district.items()
            }
        }

        # ensure_ascii=False keeps German umlauts readable in the JSON file.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"\n📁 Comprehensive dataset saved: {output_file}")
        print(f"\n📊 Final Statistics:")
        print(f" 🌳 {len(enhanced_spaces)} significant green spaces")
        print(f" 📛 {named_spaces} with official names, {len(enhanced_spaces) - named_spaces} discovered areas")
        print(f" 🌲 {with_trees} spaces with tree data ({round((with_trees/len(enhanced_spaces))*100)}%)")
        print(f" 🚻 {with_toilets} spaces with toilet data ({round((with_toilets/len(enhanced_spaces))*100)}%)")
        print(f" 🌿 {total_trees:,} total trees analyzed")
        print(f" 📏 {round(total_area/10000, 1)} hectares total area")

        print(f"\n🏙️ District Distribution:")
        # Districts sorted by how many spaces they contain, descending.
        for district, spaces in sorted(by_district.items(), key=lambda x: len(x[1]), reverse=True):
            print(f" • {district}: {len(spaces)} spaces")

        print(f"\n📈 Size Categories:")
        print(f" • Large (>5 ha): {large_spaces} spaces")
        print(f" • Medium (1-5 ha): {medium_spaces} spaces")
        print(f" • Smaller (<1 ha): {small_spaces} spaces")

        print(f"\n✨ This dataset provides comprehensive coverage of Berlin's")
        print(f" significant green spaces with real tree and toilet data!")

        return output_file
async def main():
    """CLI entry point: run the full pipeline and persist the results."""
    processor = FilteredOSMProcessor()

    try:
        enhanced_spaces = await processor.process_filtered_green_spaces()

        if enhanced_spaces:
            processor.save_enhanced_data(enhanced_spaces)
            print(f"\n🎯 SUCCESS! Ready to use in your API for accurate personality scoring!")
        else:
            print("❌ No spaces were successfully processed.")

    except KeyboardInterrupt:
        # Allow a clean Ctrl-C without a traceback.
        print("\n⚠️ Process interrupted by user")
    except Exception as e:
        # Surface the error, then re-raise so the process exits non-zero.
        print(f"❌ Error: {e}")
        raise
# Script entry point: run the async pipeline when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|
|
@ -0,0 +1,613 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Process Berlin green spaces from local OSM data file.
|
||||||
|
Downloads Berlin OSM extract once, then processes locally without API dependencies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import asyncio
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Optional, Tuple
|
||||||
|
import sys
|
||||||
|
import gzip
|
||||||
|
import math
|
||||||
|
|
||||||
|
# Add the app directory to Python path to import services
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from app.services.street_tree_service import StreetTreeService
|
||||||
|
from app.services.berlin_data_service import BerlinDataService
|
||||||
|
|
||||||
|
|
||||||
|
class LocalOSMProcessor:
|
||||||
|
def __init__(self, data_dir: str = "app/data"):
|
||||||
|
self.data_dir = Path(data_dir)
|
||||||
|
self.raw_dir = self.data_dir / "osm-raw"
|
||||||
|
self.processed_dir = self.data_dir / "processed"
|
||||||
|
|
||||||
|
# Create directories
|
||||||
|
self.raw_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.processed_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Initialize existing services
|
||||||
|
self.tree_service = StreetTreeService()
|
||||||
|
self.berlin_data = BerlinDataService()
|
||||||
|
|
||||||
|
# Berlin bounding box for filtering
|
||||||
|
self.berlin_bbox = {
|
||||||
|
'min_lat': 52.3370, 'max_lat': 52.6755,
|
||||||
|
'min_lon': 13.0882, 'max_lon': 13.7611
|
||||||
|
}
|
||||||
|
|
||||||
|
    def download_berlin_osm_extract(self):
        """Download the Berlin OSM extract from Geofabrik (one-time, cached).

        Tries the PBF URL first, then the bz2 fallback; streams the body to
        disk with a progress indicator.  Returns the local file path; raises
        if no mirror succeeds.

        NOTE(review): the cache check only looks for the .pbf filename, so a
        previous .bz2 download would be re-fetched — confirm this is intended.
        """
        osm_file = self.raw_dir / "berlin-latest.osm.pbf"

        # Cached from a previous run — skip the download entirely.
        if osm_file.exists():
            print(f"✅ OSM file already exists: {osm_file}")
            return osm_file

        # Try PBF format first (smaller), fallback to XML
        urls = [
            "https://download.geofabrik.de/europe/germany/berlin-latest.osm.pbf",
            "https://download.geofabrik.de/europe/germany/berlin-latest.osm.bz2"
        ]

        for url in urls:
            try:
                print(f"Downloading Berlin OSM data from {url}")
                print("This is a one-time download (~50MB)...")

                # stream=True avoids holding the full archive in memory.
                response = requests.get(url, stream=True, timeout=300)
                response.raise_for_status()

                filename = url.split('/')[-1]
                local_file = self.raw_dir / filename

                # Download with progress
                total_size = int(response.headers.get('content-length', 0))
                downloaded = 0

                with open(local_file, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            downloaded += len(chunk)
                            if total_size > 0:
                                percent = (downloaded / total_size) * 100
                                print(f"\rDownload progress: {percent:.1f}%", end="")

                print(f"\n✅ Downloaded: {local_file}")
                return local_file

            except Exception as e:
                # Move on to the next mirror/format before giving up.
                print(f"❌ Failed to download {url}: {e}")
                continue

        raise Exception("Could not download OSM data from any source")
    def download_simple_osm_extract(self):
        """Download a plain-XML green-space extract via the Overpass API.

        Fallback for environments without PBF tooling: performs a one-time
        Overpass export of Berlin's green-space ways (leisure/landuse/natural
        tags within the bounding box) and caches it on disk.  Returns the
        local file path; re-raises on download failure.
        """
        osm_file = self.raw_dir / "berlin_green_spaces.osm"

        # Cached from a previous run — skip the download entirely.
        if osm_file.exists():
            print(f"✅ OSM file already exists: {osm_file}")
            return osm_file

        # Use Overpass API to get a one-time export of green spaces
        print("Downloading Berlin green spaces extract...")

        overpass_url = "http://overpass-api.de/api/interpreter"

        # Query for all green spaces in Berlin (one-time download)
        # 'out geom meta;' inlines node coordinates into each way element.
        query = f"""
        [out:xml][timeout:120];
        (
          way["leisure"~"^(park|garden|nature_reserve|recreation_ground|playground|common)$"]
            ({self.berlin_bbox['min_lat']},{self.berlin_bbox['min_lon']},{self.berlin_bbox['max_lat']},{self.berlin_bbox['max_lon']});
          way["landuse"~"^(forest|grass|meadow|recreation_ground|village_green|allotments)$"]
            ({self.berlin_bbox['min_lat']},{self.berlin_bbox['min_lon']},{self.berlin_bbox['max_lat']},{self.berlin_bbox['max_lon']});
          way["natural"~"^(forest|grass|meadow|scrub|heath|wood)$"]
            ({self.berlin_bbox['min_lat']},{self.berlin_bbox['min_lon']},{self.berlin_bbox['max_lat']},{self.berlin_bbox['max_lon']});
        );
        out geom meta;
        """

        try:
            response = requests.post(overpass_url, data=query, timeout=180)
            response.raise_for_status()

            with open(osm_file, 'w', encoding='utf-8') as f:
                f.write(response.text)

            print(f"✅ Downloaded green spaces extract: {osm_file}")
            return osm_file

        except Exception as e:
            print(f"❌ Failed to download OSM extract: {e}")
            raise
    def parse_osm_xml(self, osm_file: Path) -> List[Dict]:
        """Parse OSM XML file to extract green spaces.

        Accepts a plain ``.osm`` XML file or a gzip-compressed one.  Each
        qualifying ``<way>`` is converted via ``_process_osm_way``; ways that
        fail to parse are silently skipped (best-effort).  Returns the list
        of extracted green-space dicts, or an empty list on a parse failure.
        """
        print(f"Parsing OSM data from {osm_file}...")

        green_spaces = []

        try:
            # Handle different file formats
            if osm_file.suffix == '.gz':
                with gzip.open(osm_file, 'rt', encoding='utf-8') as f:
                    tree = ET.parse(f)
            else:
                tree = ET.parse(osm_file)

            root = tree.getroot()

            # Parse ways (areas)
            ways = root.findall('.//way')
            print(f"Found {len(ways)} ways in OSM data")

            for way in ways:
                try:
                    processed_space = self._process_osm_way(way, root)
                    if processed_space:
                        green_spaces.append(processed_space)
                except Exception as e:
                    # Best-effort: a malformed way must not abort the whole parse.
                    continue

            print(f"✅ Extracted {len(green_spaces)} green spaces from OSM data")
            return green_spaces

        except Exception as e:
            print(f"❌ Error parsing OSM file: {e}")
            return []
    def _process_osm_way(self, way, root) -> Optional[Dict]:
        """Process a single OSM way into green space format.

        Returns a dict with id, name, fclass, centroid lat/lng, area_sqm,
        district, and raw OSM tags — or ``None`` when the way is not a green
        space, has fewer than 3 in-bounds vertices, or is smaller than 500 m².
        """
        # Get tags
        tags = {}
        for tag in way.findall('tag'):
            tags[tag.get('k')] = tag.get('v')

        # Check if it's a green space
        green_space_type = self._get_green_space_type(tags)
        if not green_space_type:
            return None

        # Get node references
        nd_refs = [nd.get('ref') for nd in way.findall('nd')]
        if len(nd_refs) < 3:  # Need at least 3 points for an area
            return None

        # Find node coordinates
        # NOTE(review): root.find with an id predicate scans the whole tree per
        # ref — O(ways × nodes) overall. A prebuilt {id: node} index would be
        # much faster on large extracts.
        coordinates = []
        for nd_ref in nd_refs:
            node = root.find(f".//node[@id='{nd_ref}']")
            if node is not None:
                lat = float(node.get('lat'))
                lon = float(node.get('lon'))

                # Check if within Berlin bounds
                if (self.berlin_bbox['min_lat'] <= lat <= self.berlin_bbox['max_lat'] and
                    self.berlin_bbox['min_lon'] <= lon <= self.berlin_bbox['max_lon']):
                    coordinates.append((lat, lon))

        if len(coordinates) < 3:
            return None

        # Calculate centroid and area
        centroid_lat, centroid_lon = self._calculate_centroid(coordinates)
        area_sqm = self._calculate_area(coordinates)

        # Skip very small areas
        if area_sqm < 500:
            return None

        # Get name (synthesize one from type + centroid when unnamed)
        name = tags.get('name', f"{green_space_type.title()} near {centroid_lat:.3f}, {centroid_lon:.3f}")

        # Estimate district
        district = self._estimate_district(centroid_lat, centroid_lon)

        return {
            'id': f"osm_way_{way.get('id')}",
            'name': name,
            'fclass': green_space_type,
            'lat': centroid_lat,
            'lng': centroid_lon,
            'area_sqm': int(area_sqm),
            'district': district,
            'osm_tags': tags,
            'osm_id': way.get('id')
        }
def _get_green_space_type(self, tags: Dict) -> Optional[str]:
|
||||||
|
"""Determine if tags represent a green space and what type."""
|
||||||
|
# Check leisure tags
|
||||||
|
leisure = tags.get('leisure', '')
|
||||||
|
if leisure in ['park', 'garden', 'nature_reserve', 'recreation_ground',
|
||||||
|
'playground', 'common', 'golf_course']:
|
||||||
|
return leisure
|
||||||
|
|
||||||
|
# Check landuse tags
|
||||||
|
landuse = tags.get('landuse', '')
|
||||||
|
if landuse in ['forest', 'grass', 'meadow', 'recreation_ground',
|
||||||
|
'village_green', 'allotments']:
|
||||||
|
return landuse
|
||||||
|
|
||||||
|
# Check natural tags
|
||||||
|
natural = tags.get('natural', '')
|
||||||
|
if natural in ['forest', 'grass', 'meadow', 'scrub', 'heath', 'wood']:
|
||||||
|
return natural
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _calculate_centroid(self, coordinates: List[Tuple[float, float]]) -> Tuple[float, float]:
|
||||||
|
"""Calculate centroid of polygon."""
|
||||||
|
lat_sum = sum(coord[0] for coord in coordinates)
|
||||||
|
lon_sum = sum(coord[1] for coord in coordinates)
|
||||||
|
count = len(coordinates)
|
||||||
|
|
||||||
|
return lat_sum / count, lon_sum / count
|
||||||
|
|
||||||
|
def _calculate_area(self, coordinates: List[Tuple[float, float]]) -> float:
|
||||||
|
"""Calculate area of polygon using shoelace formula."""
|
||||||
|
if len(coordinates) < 3:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# Convert to approximate meters for Berlin
|
||||||
|
lat_to_m = 111000 # meters per degree latitude
|
||||||
|
lon_to_m = 111000 * math.cos(math.radians(52.5)) # adjust for Berlin latitude
|
||||||
|
|
||||||
|
# Convert coordinates to meters
|
||||||
|
coords_m = [(lat * lat_to_m, lon * lon_to_m) for lat, lon in coordinates]
|
||||||
|
|
||||||
|
# Shoelace formula
|
||||||
|
area = 0
|
||||||
|
n = len(coords_m)
|
||||||
|
|
||||||
|
for i in range(n):
|
||||||
|
j = (i + 1) % n
|
||||||
|
area += coords_m[i][0] * coords_m[j][1]
|
||||||
|
area -= coords_m[j][0] * coords_m[i][1]
|
||||||
|
|
||||||
|
return abs(area) / 2
|
||||||
|
|
||||||
|
def _estimate_district(self, lat: float, lng: float) -> str:
|
||||||
|
"""Rough district estimation from coordinates."""
|
||||||
|
# Very rough Berlin district boundaries
|
||||||
|
if lat > 52.55:
|
||||||
|
return "Pankow" if lng < 13.45 else "Lichtenberg"
|
||||||
|
elif lat > 52.52:
|
||||||
|
if lng < 13.25:
|
||||||
|
return "Charlottenburg-Wilmersdorf"
|
||||||
|
elif lng < 13.42:
|
||||||
|
return "Mitte"
|
||||||
|
else:
|
||||||
|
return "Friedrichshain-Kreuzberg"
|
||||||
|
elif lat > 52.45:
|
||||||
|
if lng < 13.25:
|
||||||
|
return "Steglitz-Zehlendorf"
|
||||||
|
elif lng < 13.42:
|
||||||
|
return "Tempelhof-Schöneberg"
|
||||||
|
else:
|
||||||
|
return "Neukölln"
|
||||||
|
else:
|
||||||
|
return "Treptow-Köpenick"
|
||||||
|
|
||||||
|
    async def enhance_green_space_with_real_data(self, space_data: Dict):
        """Enhance one parsed OSM green space with real tree and toilet data.

        Combines OSM-derived fields (id, name, fclass, lat/lng, area_sqm,
        district, osm_tags) with live tree/toilet service lookups into the
        API-ready dict.  Returns the enriched dict, or ``None`` when any step
        fails (the error is printed, not raised).
        """
        try:
            lat = space_data['lat']
            lng = space_data['lng']
            area_sqm = space_data['area_sqm']

            print(f"Enhancing {space_data['name']} ({space_data['district']})...")

            # Adaptive radius: sqrt(area) tracks side length; clamp to 100-350 m.
            radius = min(350, max(100, int((area_sqm ** 0.5) * 0.7)))

            # Get real data using existing services
            tree_response = await self.tree_service.get_trees_near_location(
                lat, lng, radius_m=radius
            )

            # Toilets searched in a fixed 600 m radius regardless of space size.
            nearby_toilets = await self.berlin_data.get_toilets_near_point(lat, lng, 600)

            # Calculate scores
            toilet_score = self._score_toilet_accessibility(nearby_toilets)
            space_type = self._map_to_space_type(space_data.get('fclass', ''))

            enhanced_space = {
                "id": space_data['id'],
                "name": space_data['name'],
                "description": f"Berlin {space_data.get('fclass', 'green space')} from local OSM data",
                "type": space_type,
                "coordinates": {
                    "lat": float(lat),
                    "lng": float(lng)
                },
                "neighborhood": space_data.get('district', 'Unknown'),
                "area_sqm": area_sqm,
                "perimeter_m": int(4 * (area_sqm ** 0.5)),  # rough estimate (treats the space as a square)

                # Environmental features from real tree data
                "environmental": {
                    "tree_coverage_percent": max(5, int(tree_response.shade_analysis.estimated_shade_coverage)),
                    "shade_quality": tree_response.shade_analysis.shade_quality_score,
                    "noise_level": self._estimate_noise_level(space_data),
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    "water_features": self._detect_water_features(space_data),
                    "natural_surface_percent": self._estimate_natural_surface(space_data.get('fclass', ''))
                },

                # Real tree metrics
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species[:3]
                },

                # Real toilet accessibility
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": toilet_score,
                    # NOTE(review): assumes the toilet list is sorted nearest-first — confirm in BerlinDataService.
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },

                # Standard features (heuristics, not measured data)
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": self._estimate_transport_score(space_data.get('district', '')),
                    "cycling_infrastructure": area_sqm > 4000,
                    "parking_availability": 2 if area_sqm > 20000 else 1,
                    "lighting_quality": 3 if 'mitte' in space_data.get('district', '').lower() else 2
                },

                "recreation": {
                    "playground_quality": self._estimate_playground_quality(space_data),
                    "sports_facilities": self._estimate_sports_facilities(space_data),
                    "running_paths": area_sqm > 6000,
                    "cycling_paths": area_sqm > 12000,
                    "dog_friendly": True,
                    "bbq_allowed": self._allows_bbq(space_data)
                },

                # OSM metadata
                "osm_metadata": {
                    "osm_id": space_data.get('osm_id'),
                    "tags": space_data.get('osm_tags', {}),
                    "source": "local_osm_extract"
                },

                "last_updated": datetime.now().isoformat(),
                "data_sources": ["local_osm_extract", "berlin_tree_cadastre", "berlin_toilets"],
                "confidence_score": 92
            }

            trees = tree_response.metrics.total_trees
            toilets = len(nearby_toilets)
            print(f"✅ {space_data['name']}: {trees} trees, {toilets} toilets")

            return enhanced_space

        except Exception as e:
            # Best-effort enrichment: log and skip this space rather than abort the batch.
            print(f"❌ Error enhancing {space_data['name']}: {e}")
            return None
def _score_toilet_accessibility(self, nearby_toilets: List[Dict]) -> int:
|
||||||
|
if not nearby_toilets:
|
||||||
|
return 25
|
||||||
|
|
||||||
|
nearest = nearby_toilets[0]['distance_meters']
|
||||||
|
if nearest <= 200:
|
||||||
|
score = 90
|
||||||
|
elif nearest <= 400:
|
||||||
|
score = 70
|
||||||
|
else:
|
||||||
|
score = 50
|
||||||
|
|
||||||
|
# Quality bonuses
|
||||||
|
free = len([t for t in nearby_toilets if t.get('is_free', False)])
|
||||||
|
accessible = len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
|
||||||
|
score += min(10, free * 5 + accessible * 3)
|
||||||
|
|
||||||
|
return min(100, score)
|
||||||
|
|
||||||
|
def _map_to_space_type(self, fclass: str) -> str:
|
||||||
|
mapping = {
|
||||||
|
'park': 'PARK', 'forest': 'FOREST', 'garden': 'GARDEN', 'wood': 'FOREST',
|
||||||
|
'nature_reserve': 'NATURE_RESERVE', 'playground': 'PLAYGROUND',
|
||||||
|
'meadow': 'MEADOW', 'grass': 'GRASS', 'recreation_ground': 'PARK',
|
||||||
|
'common': 'PARK', 'village_green': 'GRASS', 'allotments': 'GARDEN'
|
||||||
|
}
|
||||||
|
return mapping.get(fclass, 'PARK')
|
||||||
|
|
||||||
|
def _detect_water_features(self, space_data: Dict) -> bool:
|
||||||
|
name = space_data.get('name', '').lower()
|
||||||
|
tags = space_data.get('osm_tags', {})
|
||||||
|
|
||||||
|
water_keywords = ['see', 'teich', 'pond', 'lake', 'bach', 'spree', 'wasser']
|
||||||
|
return any(keyword in name for keyword in water_keywords) or 'water' in tags.values()
|
||||||
|
|
||||||
|
def _estimate_noise_level(self, space_data: Dict) -> int:
|
||||||
|
fclass = space_data.get('fclass', '')
|
||||||
|
district = space_data.get('district', '')
|
||||||
|
|
||||||
|
base = {'forest': 1, 'wood': 1, 'nature_reserve': 1, 'meadow': 2,
|
||||||
|
'park': 2, 'garden': 2, 'playground': 3}.get(fclass, 2)
|
||||||
|
|
||||||
|
if any(busy in district.lower() for busy in ['mitte', 'kreuzberg', 'friedrichshain']):
|
||||||
|
base += 1
|
||||||
|
|
||||||
|
return min(5, base)
|
||||||
|
|
||||||
|
def _estimate_natural_surface(self, fclass: str) -> int:
|
||||||
|
return {'forest': 95, 'wood': 95, 'nature_reserve': 90, 'meadow': 95,
|
||||||
|
'grass': 85, 'park': 75, 'garden': 65, 'playground': 40}.get(fclass, 70)
|
||||||
|
|
||||||
|
def _estimate_transport_score(self, district: str) -> int:
|
||||||
|
district_lower = district.lower()
|
||||||
|
if 'mitte' in district_lower:
|
||||||
|
return 5
|
||||||
|
elif any(name in district_lower for name in ['charlottenburg', 'kreuzberg', 'friedrichshain']):
|
||||||
|
return 4
|
||||||
|
else:
|
||||||
|
return 3
|
||||||
|
|
||||||
|
def _estimate_playground_quality(self, space_data: Dict) -> int:
|
||||||
|
fclass = space_data.get('fclass', '')
|
||||||
|
tags = space_data.get('osm_tags', {})
|
||||||
|
|
||||||
|
if fclass == 'playground':
|
||||||
|
return 80
|
||||||
|
elif 'playground' in tags.values():
|
||||||
|
return 75
|
||||||
|
elif fclass == 'park':
|
||||||
|
return 55
|
||||||
|
else:
|
||||||
|
return 30
|
||||||
|
|
||||||
|
def _estimate_sports_facilities(self, space_data: Dict) -> bool:
|
||||||
|
fclass = space_data.get('fclass', '')
|
||||||
|
tags = space_data.get('osm_tags', {})
|
||||||
|
name = space_data.get('name', '').lower()
|
||||||
|
|
||||||
|
return (fclass == 'recreation_ground' or
|
||||||
|
'sport' in str(tags.values()).lower() or
|
||||||
|
any(term in name for term in ['sport', 'football', 'tennis']))
|
||||||
|
|
||||||
|
def _allows_bbq(self, space_data: Dict) -> bool:
|
||||||
|
fclass = space_data.get('fclass', '')
|
||||||
|
area = space_data.get('area_sqm', 0)
|
||||||
|
tags = space_data.get('osm_tags', {})
|
||||||
|
|
||||||
|
# Check explicit BBQ tags
|
||||||
|
if tags.get('bbq') == 'yes':
|
||||||
|
return True
|
||||||
|
elif tags.get('bbq') == 'no':
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Default based on type and size
|
||||||
|
return fclass in ['park', 'recreation_ground'] and area > 5000
|
||||||
|
|
||||||
|
async def process_all_green_spaces(self):
    """Main processing pipeline.

    Downloads a local OSM extract, parses its green spaces, then enhances
    each one with real Berlin tree and toilet data.

    Returns:
        list: the enhanced space dicts (empty list on any failure).
    """
    print("🌳 Processing Berlin green spaces from local OSM data...")

    # Step 1: Get OSM data
    try:
        osm_file = self.download_simple_osm_extract()  # More reliable than PBF
    except Exception:
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit. Narrowed to Exception so Ctrl-C still works.
        print("❌ Could not download OSM data")
        return []

    # Step 2: Parse green spaces
    green_spaces = self.parse_osm_xml(osm_file)

    if not green_spaces:
        print("❌ No green spaces found in OSM data")
        return []

    print(f"📊 Found {len(green_spaces)} green spaces to enhance")

    # Step 3: Enhance with real data
    enhanced_spaces = []

    for i, space_data in enumerate(green_spaces, 1):
        print(f"[{i}/{len(green_spaces)}]", end=" ")

        result = await self.enhance_green_space_with_real_data(space_data)
        if result:
            enhanced_spaces.append(result)

        if i % 20 == 0:
            print(f"\n   Progress: {len(enhanced_spaces)} enhanced so far...")

        # Brief pause so downstream data services aren't hammered.
        await asyncio.sleep(0.1)

    print(f"\n✅ Enhanced {len(enhanced_spaces)} spaces with real data!")
    return enhanced_spaces
|
||||||
|
|
||||||
|
def save_enhanced_data(self, enhanced_spaces: List[Dict]):
    """Save the final enhanced dataset to JSON.

    Writes the spaces plus provenance and summary statistics to
    `osm_berlin_green_spaces_enhanced.json` under `self.processed_dir`.

    Args:
        enhanced_spaces: fully-enhanced space dicts (may be empty).

    Returns:
        Path: the output file that was written.
    """
    output_file = self.processed_dir / "osm_berlin_green_spaces_enhanced.json"

    # Calculate statistics
    total_count = len(enhanced_spaces)
    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)

    # FIX: guard against an empty dataset so the coverage percentages cannot
    # raise ZeroDivisionError (the quick processor already guards this way).
    tree_coverage = f"{round((with_trees / total_count) * 100, 1)}%" if total_count else "0%"
    toilet_coverage = f"{round((with_toilets / total_count) * 100, 1)}%" if total_count else "0%"

    data = {
        "green_spaces": enhanced_spaces,
        "total_count": total_count,
        "last_updated": datetime.now().isoformat(),
        "data_sources": [
            "local_osm_extract_processed_offline",
            "berlin_tree_cadastre",
            "berlin_toilets"
        ],
        "processing_info": {
            "method": "local_osm_processing_no_api_dependency",
            "includes_all_osm_green_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": total_count,
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            "tree_coverage": tree_coverage,
            "toilet_coverage": toilet_coverage
        }
    }

    # ensure_ascii=False keeps German umlauts readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n🎉 Saved comprehensive dataset: {output_file}")
    print(f"📊 {total_count} total green spaces")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print(f"\n✨ Ready to replace mock data in your API!")

    return output_file
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Entry point: run the full local OSM processing pipeline."""
    processor = LocalOSMProcessor()

    try:
        banner = "=" * 50
        print("🚀 Berlin Green Spaces: Local OSM Processing")
        print(banner)
        print("• Downloads OSM data once (no API dependency)")
        print("• Processes locally for all green spaces")
        print("• Enhances with real Berlin tree + toilet data")
        print(banner)

        enhanced_spaces = await processor.process_all_green_spaces()

        if enhanced_spaces:
            processor.save_enhanced_data(enhanced_spaces)

    except KeyboardInterrupt:
        print("\n⚠️ Interrupted")
    except Exception as e:
        # Top-level boundary: report and exit cleanly.
        print(f"❌ Error: {e}")


if __name__ == "__main__":
    asyncio.run(main())
|
|
@ -0,0 +1,558 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Quick Berlin green spaces processor.
|
||||||
|
Pre-filters OSM data efficiently, then processes only the best candidates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import math
|
||||||
|
# from tqdm.asyncio import tqdm # Not available, remove tqdm dependency
|
||||||
|
from xml.etree.ElementTree import iterparse
|
||||||
|
|
||||||
|
# Add the app directory to Python path
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from app.services.street_tree_service import StreetTreeService
|
||||||
|
from app.services.berlin_data_service import BerlinDataService
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_polygon_area_sqm(coords):
    """Approximate a lat/lng polygon's area in square meters.

    Uses a planar projection around the polygon's centroid plus the
    Shoelace formula. `coords` is a sequence of (lat, lng) pairs.
    Degenerate or implausible results are snapped to sensible defaults.
    """
    if len(coords) < 3:
        return 5000  # Default for invalid polygons

    # Planar approximation: scale degrees to meters around the polygon center.
    count = len(coords)
    lat_center = sum(pt[0] for pt in coords) / count
    lng_center = sum(pt[1] for pt in coords) / count
    meters_per_lat = 111320  # roughly constant worldwide
    meters_per_lng = 111320 * math.cos(math.radians(lat_center))

    # Project every vertex to meters relative to the center.
    xs = [(lng - lng_center) * meters_per_lng for _, lng in coords]
    ys = [(lat - lat_center) * meters_per_lat for lat, _ in coords]

    # Shoelace formula over the projected coordinates.
    twice_area = 0.0
    for i in range(count):
        j = (i + 1) % count
        twice_area += xs[i] * ys[j] - xs[j] * ys[i]
    area = abs(twice_area) / 2

    # Clamp implausible results.
    if area < 100:          # degenerate / tiny polygon
        return 5000
    if area > 10000000:     # > 10 km² is suspicious for a single space
        return 500000       # cap at a large-park size
    return int(area)
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_search_radius(area_sqm):
    """Pick a tree-search radius (meters) proportional to the park's size."""
    # Ascending size thresholds: 1 ha, 5 ha, 20 ha.
    for threshold, radius in ((10000, 150), (50000, 300), (200000, 500)):
        if area_sqm < threshold:
            return radius
    return 800  # very large parks like Treptower Park
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_enhanced_shade_quality(tree_response, area_sqm):
    """Score shade quality 0-100 from real tree metrics.

    Combines crown-based shade coverage, counts of large mature trees,
    tree density, average height and crown diameter. `area_sqm` is kept
    for interface compatibility (density is already per-hectare).
    """
    metrics = tree_response.metrics
    shade_analysis = tree_response.shade_analysis

    def tiered(value, tiers):
        # tiers: descending (threshold, points) pairs; first match wins.
        for threshold, points in tiers:
            if value >= threshold:
                return points
        return 0

    score = 0

    # Factor 1: actual shade coverage (crown area based)
    score += tiered(metrics.shade_coverage_percent or 0,
                    ((60, 40), (40, 30), (20, 20), (10, 10)))

    # Factor 2: large mature trees cast the best shade
    score += tiered(len(shade_analysis.nearby_large_trees or []),
                    ((10, 25), (5, 20), (3, 15), (1, 10)))

    # Factor 3: tree density per area
    score += tiered(metrics.trees_per_hectare or 0,
                    ((50, 20), (30, 15), (20, 10), (10, 5)))

    # Factor 4: taller trees give better shade
    score += tiered(metrics.average_height or 0,
                    ((20, 10), (15, 8), (10, 5), (5, 3)))

    # Factor 5: crown diameter quality
    score += tiered(metrics.average_crown_diameter or 0,
                    ((12, 5), (8, 3), (5, 1)))

    return min(100, score)
|
||||||
|
|
||||||
|
|
||||||
|
def detect_water_features(candidate):
    """Detect water features via OSM tags and name keywords."""
    osm_tags = candidate.get('tags', {})
    lowered_name = candidate.get('name', '').lower()

    # Any of these OSM keys carrying a watery value counts.
    watery_values = {'water', 'lake', 'pond', 'reservoir', 'river', 'stream'}
    tag_hit = any(
        osm_tags.get(key, '').lower() in watery_values
        for key in ('water', 'waterway', 'natural')
    )

    # Name-based indicators (German and English).
    name_hit = any(word in lowered_name for word in (
        'see', 'teich', 'weiher', 'water', 'lake', 'pond',
        'fluss', 'river', 'bach', 'creek',
    ))

    # Fountains ("Brunnen") count as water features too.
    fountain_hit = any(word in lowered_name
                       for word in ('brunnen', 'fountain', 'springbrunnen'))

    return tag_hit or name_hit or fountain_hit
|
||||||
|
|
||||||
|
|
||||||
|
def estimate_berlin_district(lat: float, lng: float) -> str:
    """Estimate the Berlin district for a coordinate via coarse lat/lng bands.

    Each latitude band maps ascending longitude cut-offs to districts; the
    last element of each band is the catch-all for everything further east.
    """
    # (min_lat exclusive, ((lng upper bound, district), ...), easternmost fallback)
    bands = (
        (52.55,
         ((13.25, "Reinickendorf"), (13.45, "Pankow")),
         "Lichtenberg"),
        (52.52,
         ((13.20, "Spandau"), (13.30, "Charlottenburg-Wilmersdorf"),
          (13.42, "Mitte"), (13.48, "Friedrichshain-Kreuzberg")),
         "Lichtenberg"),
        (52.48,
         ((13.20, "Spandau"), (13.30, "Charlottenburg-Wilmersdorf"),
          (13.35, "Tempelhof-Schöneberg"), (13.42, "Mitte"),
          (13.48, "Friedrichshain-Kreuzberg")),
         "Lichtenberg"),
        (52.45,
         ((13.20, "Steglitz-Zehlendorf"), (13.35, "Tempelhof-Schöneberg"),
          (13.45, "Neukölln"), (13.55, "Treptow-Köpenick")),
         "Marzahn-Hellersdorf"),
        (float("-inf"),
         ((13.35, "Steglitz-Zehlendorf"),),
         "Treptow-Köpenick"),
    )

    for min_lat, lng_cuts, fallback in bands:
        if lat > min_lat:
            for upper_lng, district in lng_cuts:
                if lng < upper_lng:
                    return district
            return fallback
    return "Treptow-Köpenick"  # unreachable: last band accepts any latitude
|
||||||
|
|
||||||
|
|
||||||
|
def get_specific_neighborhood(district: str, lat: float, lng: float) -> str:
    """Refine a district to a neighborhood using coordinate bounding boxes.

    Falls back to the district name when no box matches (or the district
    has no sub-neighborhood boxes defined). First matching box wins.
    """
    # district -> ordered list of (min_lat, max_lat, min_lng, max_lng, name)
    boxes_by_district = {
        "Mitte": [
            (52.540, 52.560, 13.33, 13.38, "Wedding"),
            (52.515, 52.530, 13.33, 13.38, "Moabit"),
            (52.510, 52.520, 13.35, 13.38, "Tiergarten"),
            (52.525, 52.545, 13.40, 13.43, "Prenzlauer Berg"),
        ],
        "Charlottenburg-Wilmersdorf": [
            (52.485, 52.505, 13.30, 13.33, "Wilmersdorf"),
            (52.505, 52.525, 13.25, 13.33, "Charlottenburg"),
        ],
        "Friedrichshain-Kreuzberg": [
            (52.490, 52.510, 13.38, 13.42, "Kreuzberg"),
            (52.510, 52.525, 13.42, 13.48, "Friedrichshain"),
        ],
        "Tempelhof-Schöneberg": [
            (52.480, 52.500, 13.33, 13.37, "Schöneberg"),
            (52.460, 52.480, 13.37, 13.42, "Tempelhof"),
        ],
        "Steglitz-Zehlendorf": [
            (52.430, 52.450, 13.23, 13.30, "Zehlendorf"),
            (52.450, 52.470, 13.30, 13.35, "Steglitz"),
        ],
        "Treptow-Köpenick": [
            (52.430, 52.460, 13.55, 13.65, "Köpenick"),
            (52.480, 52.500, 13.45, 13.50, "Treptow"),
        ],
    }

    for min_lat, max_lat, min_lng, max_lng, neighborhood in boxes_by_district.get(district, []):
        if min_lat <= lat <= max_lat and min_lng <= lng <= max_lng:
            return neighborhood
    return district
|
||||||
|
|
||||||
|
|
||||||
|
async def quick_process():
    """Quick processing of significant Berlin green spaces.

    Pipeline: (1) single-pass iterparse scan of a local OSM XML extract to
    collect up to 100 candidate ways, (2) concurrent batched enhancement of
    each candidate with real tree and toilet data, (3) JSON dump of the
    enhanced dataset plus summary statistics.
    """
    print("🚀 Quick Berlin Green Spaces Processor")
    print("=" * 45)

    # Initialize services
    tree_service = StreetTreeService()
    berlin_data = BerlinDataService()

    # Pre-load and index trees once to avoid repeated indexing
    print("🔄 Pre-loading tree data and building spatial index...")
    await tree_service._load_trees()

    osm_file = Path("app/data/osm-raw/berlin_green_spaces.osm")

    if not osm_file.exists():
        print("❌ OSM file not found. Please ensure data is downloaded.")
        return

    print("🔍 Quick filtering for named parks and significant areas...")
    print(f"📁 OSM file size: {osm_file.stat().st_size / (1024*1024):.1f} MB")

    # Quick scan for good candidates
    candidates = []

    try:
        processed = 0

        print("🔍 Single-pass XML parsing - ways with embedded coordinates...")

        # Single pass: parse ways with embedded coordinates.
        # State machine over iterparse events: on 'start' we accumulate
        # tags/coords for the current <way>; on the way's 'end' we decide
        # whether it becomes a candidate.
        ways_processed = 0
        current_way_tags = {}
        current_way_coordinates = []
        in_way = False

        for event, elem in iterparse(osm_file, events=('start', 'end')):
            if event == 'start':
                if elem.tag == 'way':
                    in_way = True
                    current_way_tags = {}
                    current_way_coordinates = []
                    ways_processed += 1
                    if ways_processed % 1000 == 0:
                        print(f"Processed {ways_processed} ways, found {len(candidates)} candidates so far...")
                elif in_way and elem.tag == 'tag':
                    k = elem.get('k')
                    v = elem.get('v')
                    if k and v:
                        current_way_tags[k] = v
                elif in_way and elem.tag == 'nd':
                    # Extract coordinates directly from nd element
                    # NOTE(review): plain OSM <nd> elements only carry a ref,
                    # not lat/lon — this assumes a pre-processed extract with
                    # embedded coordinates; verify against the data file.
                    lat = elem.get('lat')
                    lon = elem.get('lon')
                    if lat and lon:
                        current_way_coordinates.append((float(lat), float(lon)))
                continue

            # 'end' events from here on.
            if elem.tag == 'way' and in_way:
                in_way = False
                tags = current_way_tags
                coordinates = current_way_coordinates

                # Quick filters for promising spaces - be more lenient
                has_name = 'name' in tags
                is_park = (tags.get('leisure') in ['park', 'garden', 'nature_reserve'] or
                           tags.get('landuse') in ['forest', 'grass', 'recreation_ground'])

                # Also accept common green space tags
                has_green_tags = any(key in tags for key in ['leisure', 'landuse', 'natural', 'amenity'])

                if not (has_name or is_park or has_green_tags):
                    elem.clear()  # Free memory
                    continue

                # Use embedded coordinates directly
                if not coordinates:
                    elem.clear()  # Free memory
                    continue

                # Get center coordinate and all coordinates for area calculation
                lat, lng = coordinates[0] if len(coordinates) == 1 else (
                    sum(lat for lat, lng in coordinates) / len(coordinates),
                    sum(lng for lat, lng in coordinates) / len(coordinates)
                )

                # Basic Berlin bounds check
                if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
                    elem.clear()  # Free memory
                    continue

                name = tags.get('name', f"Unnamed {tags.get('leisure', tags.get('landuse', 'area'))}")
                space_type = tags.get('leisure') or tags.get('landuse') or 'park'

                candidate = {
                    'id': f"quick_{elem.get('id')}",
                    'name': name,
                    'type': space_type,
                    'lat': lat,
                    'lng': lng,
                    'has_name': has_name,
                    'tags': tags,
                    'coordinates': coordinates  # Store all coordinates for area calculation
                }

                candidates.append(candidate)
                processed += 1

                # Limit for quick processing
                if len(candidates) >= 100:
                    elem.clear()  # Free memory
                    break

                elem.clear()  # Free memory
            else:
                elem.clear()  # Free memory

        print(f"✅ Found {len(candidates)} promising green spaces")

    except Exception as e:
        print(f"❌ Error in quick filtering: {e}")
        return

    if not candidates:
        print("No candidates found")
        return

    # Sort by having names (better quality)
    candidates.sort(key=lambda x: x['has_name'], reverse=True)

    print(f"\n🔧 Enhancing top {len(candidates)} spaces with real data...")

    # Process candidates in parallel with batching
    batch_size = 10  # Process 10 candidates at a time
    enhanced_spaces = []

    async def process_candidate(candidate):
        """Process a single candidate with tree and toilet data.

        Returns (enhanced_space_dict, tree_count, toilet_count), or
        (None, 0, 0) if enhancement failed for this candidate.
        """
        try:
            # Calculate actual area from OSM polygon coordinates
            area_sqm = calculate_polygon_area_sqm(candidate.get('coordinates', []))
            search_radius = calculate_search_radius(area_sqm)

            # Get real tree data and toilet data concurrently with dynamic radius
            tree_task = tree_service.get_trees_near_location(
                candidate['lat'], candidate['lng'], radius_m=search_radius
            )
            toilet_task = berlin_data.get_toilets_near_point(
                candidate['lat'], candidate['lng'], 500
            )

            print(f"🔍 Getting data for {candidate['name'][:30]}... (area: {area_sqm/10000:.1f}ha, radius: {search_radius}m)")
            tree_response, nearby_toilets = await asyncio.gather(tree_task, toilet_task)

            # Create enhanced space
            enhanced_space = {
                "id": candidate['id'],
                "name": candidate['name'],
                "description": f"Berlin {candidate['type']} discovered via quick OSM processing",
                "type": "PARK",  # Simplified for now
                "coordinates": {
                    "lat": candidate['lat'],
                    "lng": candidate['lng']
                },
                "neighborhood": get_specific_neighborhood(estimate_berlin_district(candidate['lat'], candidate['lng']), candidate['lat'], candidate['lng']),
                "area_sqm": area_sqm,  # Real calculated area

                # Environmental features from real tree data
                "environmental": {
                    "tree_coverage_percent": max(5, int(tree_response.metrics.shade_coverage_percent)),  # Use actual crown area calculation
                    "shade_quality": calculate_enhanced_shade_quality(tree_response, area_sqm),
                    "noise_level": 2,  # Default
                    "wildlife_diversity_score": tree_response.metrics.species_diversity_score,
                    "water_features": detect_water_features(candidate),
                    "natural_surface_percent": 80
                },

                # Real tree data
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "trees_per_hectare": tree_response.metrics.trees_per_hectare,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "species_diversity_score": tree_response.metrics.species_diversity_score,
                    "mature_trees_count": tree_response.metrics.mature_trees_count,
                    "young_trees_count": tree_response.metrics.young_trees_count,
                    "average_tree_age": tree_response.metrics.average_tree_age,
                    "average_height": tree_response.metrics.average_height,
                    "average_crown_diameter": tree_response.metrics.average_crown_diameter,
                    "shade_coverage_percent": tree_response.metrics.shade_coverage_percent,
                    "dominant_species": tree_response.metrics.dominant_species[:3]
                },

                # Real toilet data
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    "accessibility_score": 80 if nearby_toilets else 30,
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                    "free_toilets_count": len([t for t in nearby_toilets if t.get('is_free', False)]),
                    "accessible_toilets_count": len([t for t in nearby_toilets if t.get('wheelchair_accessible', False)])
                },

                # Standard features (fixed defaults; not derived from data)
                "accessibility": {
                    "wheelchair_accessible": True,
                    "public_transport_score": 3,
                    "cycling_infrastructure": True,
                    "parking_availability": 2,
                    "lighting_quality": 3
                },

                "recreation": {
                    "playground_quality": 60 if candidate['type'] == 'park' else 30,
                    "sports_facilities": candidate['type'] == 'recreation_ground',
                    "running_paths": True,
                    "cycling_paths": True,
                    "dog_friendly": True,
                    "bbq_allowed": candidate['type'] in ['park', 'recreation_ground']
                },

                "osm_metadata": {
                    "has_official_name": candidate['has_name'],
                    "tags": candidate['tags'],
                    "source": "quick_osm_processing"
                },

                "last_updated": datetime.now().isoformat(),
                "data_sources": ["quick_osm_scan", "berlin_tree_cadastre", "berlin_toilets"],
                "confidence_score": 90 if candidate['has_name'] else 75
            }

            return enhanced_space, tree_response.metrics.total_trees, len(nearby_toilets)

        except Exception as e:
            print(f"❌ Error processing {candidate['name']}: {e}")
            return None, 0, 0

    # Process candidates in batches with progress bar
    for i in range(0, len(candidates), batch_size):
        batch = candidates[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(candidates) + batch_size - 1)//batch_size}")

        # Process batch concurrently with progress bar
        tasks = [process_candidate(candidate) for candidate in batch]
        results = await asyncio.gather(*tasks)

        # Collect results
        for result, trees, toilets in results:
            if result:
                enhanced_spaces.append(result)
                print(f"✅ {result['name'][:40]:40} - {trees:3d} trees, {toilets} toilets")

        # Small delay between batches to be respectful to APIs
        if i + batch_size < len(candidates):
            await asyncio.sleep(0.5)

    # Save results
    output_file = Path("app/data/processed/quick_berlin_green_spaces.json")

    # Summary statistics over the enhanced dataset.
    with_trees = len([s for s in enhanced_spaces if s["tree_data"]["total_trees"] > 0])
    with_toilets = len([s for s in enhanced_spaces if s["toilet_accessibility"]["nearby_toilets_count"] > 0])
    total_trees = sum(s["tree_data"]["total_trees"] for s in enhanced_spaces)

    data = {
        "green_spaces": enhanced_spaces,
        "total_count": len(enhanced_spaces),
        "last_updated": datetime.now().isoformat(),
        "data_sources": ["quick_osm_processing", "berlin_tree_cadastre", "berlin_toilets"],
        "processing_info": {
            "method": "quick_scan_for_named_and_significant_spaces",
            "prioritizes_named_spaces": True,
            "enhanced_with_real_berlin_data": True
        },
        "summary_stats": {
            "total_spaces": len(enhanced_spaces),
            "spaces_with_tree_data": with_trees,
            "spaces_with_toilet_data": with_toilets,
            "total_trees_analyzed": total_trees,
            # Guarded so an empty result set cannot divide by zero.
            "tree_coverage": f"{round((with_trees/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%",
            "toilet_coverage": f"{round((with_toilets/len(enhanced_spaces))*100, 1)}%" if enhanced_spaces else "0%"
        }
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n🎉 Quick processing complete!")
    print(f"📁 Saved: {output_file}")
    print(f"📊 {len(enhanced_spaces)} spaces enhanced")
    print(f"🌲 {with_trees} with tree data, 🚻 {with_toilets} with toilet data")
    print(f"🌿 {total_trees} total trees analyzed")
    print(f"\n✨ Ready to use! This gives you real Berlin green spaces")
    print(f"   with actual tree and toilet data for personality scoring!")


if __name__ == "__main__":
    asyncio.run(quick_process())
|
|
@ -0,0 +1,169 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test OSM processing with a small sample to verify it works.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import sys
|
||||||
|
import math
|
||||||
|
|
||||||
|
# Add the app directory to Python path
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from app.services.street_tree_service import StreetTreeService
|
||||||
|
from app.services.berlin_data_service import BerlinDataService
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_green_space(tags):
    """Return the green-space category for an OSM tag dict, or None.

    Checks ``leisure``, then ``landuse``, then ``natural`` — the same
    priority order used when processing the full dataset.
    """
    leisure = tags.get('leisure', '')
    landuse = tags.get('landuse', '')
    natural = tags.get('natural', '')
    if leisure in ('park', 'garden', 'nature_reserve'):
        return leisure
    if landuse in ('forest', 'grass', 'park'):
        return landuse
    if natural in ('forest', 'wood'):
        return natural
    return None


async def test_processing(sample_limit: int = 5):
    """Smoke-test the OSM enhancement pipeline on a small sample.

    Parses the raw Berlin OSM extract, picks the first ``sample_limit``
    ways that look like green spaces, enriches each with real tree and
    toilet data from the project services, and writes the results to
    ``app/data/processed/test_green_spaces.json``.

    Args:
        sample_limit: How many green spaces to sample (default 5, matching
            the original hard-coded behavior).

    Returns:
        True when the run completes; None (early) if the OSM extract is
        missing.
    """
    print("🧪 Testing OSM processing with sample data...")

    # Initialize the project data services.
    tree_service = StreetTreeService()
    berlin_data = BerlinDataService()

    osm_file = Path("app/data/osm-raw/berlin_green_spaces.osm")
    if not osm_file.exists():
        print("❌ OSM file not found")
        return

    root = ET.parse(osm_file).getroot()
    ways = root.findall('.//way')
    print(f"📊 Found {len(ways)} total ways in OSM file")

    # Index nodes by id ONCE. The previous per-way XPath lookup
    # (root.find(".//node[@id=...]")) rescanned every node element for
    # each candidate way, which is O(ways * nodes) on a city-sized file.
    node_index = {node.get('id'): node for node in root.findall('.//node')}

    # Collect the first `sample_limit` ways that classify as green spaces.
    sample_spaces = []
    for way in ways:
        if len(sample_spaces) >= sample_limit:
            break

        tags = {tag.get('k'): tag.get('v') for tag in way.findall('tag')}
        green_space_type = _classify_green_space(tags)
        if not green_space_type:
            continue

        # Require at least 3 node refs so the way plausibly outlines an area.
        nd_refs = [nd.get('ref') for nd in way.findall('nd')]
        if len(nd_refs) < 3:
            continue

        # Use the way's first node as a rough center estimate.
        first_node = node_index.get(nd_refs[0])
        if first_node is None:
            continue

        sample_spaces.append({
            'id': f"test_{way.get('id')}",
            'name': tags.get('name', f"Test {green_space_type} {len(sample_spaces) + 1}"),
            'fclass': green_space_type,
            'lat': float(first_node.get('lat')),
            'lng': float(first_node.get('lon')),
            'area_sqm': 5000,  # placeholder area for the test run
            'district': 'Test District',
        })

    print(f"🌳 Testing with {len(sample_spaces)} sample green spaces...")

    # Enhance each sample with real tree and toilet data.
    enhanced_spaces = []
    for i, space_data in enumerate(sample_spaces, 1):
        print(f"\n[{i}/{len(sample_spaces)}] Testing {space_data['name']}...")

        try:
            tree_response = await tree_service.get_trees_near_location(
                space_data['lat'], space_data['lng'], radius_m=200
            )
            nearby_toilets = await berlin_data.get_toilets_near_point(
                space_data['lat'], space_data['lng'], 500
            )

            enhanced_spaces.append({
                "id": space_data['id'],
                "name": space_data['name'],
                "type": "PARK",
                "coordinates": {
                    "lat": space_data['lat'],
                    "lng": space_data['lng'],
                },
                "tree_data": {
                    "total_trees": tree_response.metrics.total_trees,
                    "species_count": len(tree_response.metrics.dominant_species),
                    "dominant_species": tree_response.metrics.dominant_species,
                },
                "toilet_accessibility": {
                    "nearby_toilets_count": len(nearby_toilets),
                    # Toilets are assumed sorted nearest-first by the
                    # service — TODO confirm against BerlinDataService.
                    "nearest_distance_m": nearby_toilets[0]['distance_meters'] if nearby_toilets else None,
                },
            })

            print(
                f"✅ Success: {tree_response.metrics.total_trees} trees, "
                f"{len(nearby_toilets)} toilets nearby"
            )
        except Exception as e:
            # Best-effort test run: report the failure and keep going.
            print(f"❌ Error: {e}")

    # Save test results; ensure the output directory exists first.
    output_file = Path("app/data/processed/test_green_spaces.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    test_data = {
        "test_results": enhanced_spaces,
        "total_tested": len(enhanced_spaces),
        "osm_ways_available": len(ways),
        # Was hard-coded True; only claim success when at least one space
        # was actually enhanced.
        "processing_successful": bool(enhanced_spaces),
        "timestamp": datetime.now().isoformat(),
    }

    # Match the main processing script's save style (utf-8, readable unicode).
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(test_data, f, indent=2, ensure_ascii=False)

    print(f"\n🎉 Test completed successfully!")
    print(f"📁 Test results saved: {output_file}")
    print(f"📊 Enhanced {len(enhanced_spaces)} sample spaces")
    print(f"💡 Ready to process all {len(ways)} green spaces!")

    return True
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the small-sample smoke test.
if __name__ == "__main__":
    asyncio.run(test_processing())
|
Loading…
Reference in New Issue