""" Berlin Open Data integration client. This module handles fetching and processing data from Berlin's Open Data Portal. """ import httpx import json from typing import List, Dict, Any, Optional from pathlib import Path from datetime import datetime import logging logger = logging.getLogger(__name__) class BerlinOpenDataClient: """Client for fetching data from Berlin Open Data Portal.""" def __init__(self): # Updated URLs based on user specification self.green_spaces_wfs = "https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt/s_gruenanlage" self.green_spaces_wms = "https://fbinter.stadt-berlin.de/fb/wms/senstadt/k_gruenanlage" self.base_url = "https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt" self.data_dir = Path("app/data") self.timeout = 30.0 # Parameters for WFS request as specified by user self.green_spaces_params = { "service": "WFS", "version": "2.0.0", "request": "GetFeature", "typeName": "fis:s_gruenanlage", "outputFormat": "application/json", "srsName": "EPSG:4326" # WGS84 coordinate system } # Create data directories self.raw_dir = self.data_dir / "raw" self.processed_dir = self.data_dir / "processed" self.raw_dir.mkdir(parents=True, exist_ok=True) self.processed_dir.mkdir(parents=True, exist_ok=True) async def fetch_green_spaces(self) -> List[Dict[str, Any]]: """ Fetch green spaces from Berlin WFS service using the specified s_gruenanlage endpoint. Returns: List of GeoJSON features representing green spaces Raises: httpx.HTTPError: If the request fails ValueError: If the response format is invalid """ logger.info("Fetching green spaces from Berlin Open Data (s_gruenanlage)...") # Primary endpoint using user-specified parameters primary_endpoint = { 'url': self.green_spaces_wfs, 'params': self.green_spaces_params.copy() } # Fallback endpoints with alternative parameters fallback_endpoints = [ { 'url': self.green_spaces_wfs, 'params': { 'service': 'WFS', 'version': '1.1.0', 'request': 'GetFeature', 'typeName': 'fis:s_gruenanlage', 'outputFormat': 'application/json', 'srsName': 'EPSG:4326' } }, { 'url': self.green_spaces_wfs, 'params': { 'service': 'WFS', 'version': '2.0.0', 'request': 'GetFeature', 'typeNames': 'fis:s_gruenanlage', 'outputFormat': 'application/json', 'srsName': 'EPSG:4326' } }, { 'url': self.green_spaces_wfs, 'params': { 'service': 'WFS', 'version': '1.1.0', 'request': 'GetFeature', 'typeName': 's_gruenanlage', 'outputFormat': 'application/json', 'srsName': 'EPSG:4326' } } ] # Combine primary and fallback endpoints endpoints_to_try = [primary_endpoint] + fallback_endpoints last_error = None for i, endpoint in enumerate(endpoints_to_try): try: url = endpoint['url'] params = endpoint['params'] logger.info(f"Trying endpoint {i+1}/{len(endpoints_to_try)}: {url}") logger.debug(f"Parameters: {params}") async with httpx.AsyncClient(timeout=self.timeout) as client: response = await client.get(url, params=params) # Log response details for debugging logger.debug(f"Response status: {response.status_code}") logger.debug(f"Response headers: {dict(response.headers)}") if response.status_code == 200: # Parse JSON response data = response.json() # Validate response structure if 'features' in data: features = data['features'] logger.info(f"Successfully fetched {len(features)} green spaces using endpoint {i+1}") # Save raw data for debugging/backup await self._save_raw_data(data, "berlin_green_spaces_gruenanlage.geojson") return features else: logger.warning(f"Endpoint {i+1} returned data without 'features' field") # Log the response structure for debugging 
logger.debug(f"Response keys: {list(data.keys()) if isinstance(data, dict) else 'Not a dict'}") continue else: logger.warning(f"Endpoint {i+1} returned status {response.status_code}") # Try to get error details try: error_text = response.text[:500] # First 500 chars logger.debug(f"Error response: {error_text}") except: pass continue except Exception as e: logger.warning(f"Endpoint {i+1} failed: {e}") last_error = e continue # If we get here, all endpoints failed if last_error: raise last_error else: raise ValueError("All WFS endpoints failed to return valid data") def process_green_space_feature(self, feature: Dict) -> Optional[Dict[str, Any]]: """ Process a single green space feature into our standardized format. Args: feature: GeoJSON feature from Berlin Open Data Returns: Processed green space data or None if processing fails """ try: properties = feature.get('properties', {}) geometry = feature.get('geometry', {}) # Skip features without essential data if not properties.get('gruenanlage') or not geometry: logger.warning(f"Skipping feature with missing essential data: {properties.get('gml_id', 'unknown')}") return None # Extract coordinates (centroid for polygon) coords = self._extract_centroid(geometry) if not coords: logger.warning(f"Could not extract coordinates for feature: {properties.get('gml_id', 'unknown')}") return None # Calculate area in square meters area_ha = properties.get('flaeche_ha') area_sqm = 0 if area_ha: try: area_sqm = int(float(area_ha) * 10000) except (ValueError, TypeError): logger.warning(f"Invalid area value for feature {properties.get('gml_id')}: {area_ha}") # Clean and validate name name = str(properties.get('gruenanlage', 'Unnamed Green Space')).strip() if not name or name.lower() in ['null', 'none', '']: name = 'Unnamed Green Space' # Clean district and sub-district names district = str(properties.get('bezirk', '')).strip() sub_district = str(properties.get('ortsteil', '')).strip() # Normalize category category = str(properties.get('kategorie', 'park')).strip().lower() processed_data = { 'id': f"berlin_{properties.get('gml_id', 'unknown')}", 'name': name, 'district': district, 'sub_district': sub_district, 'category': category, 'area_sqm': area_sqm, 'coordinates': coords, 'geometry': geometry, # Keep full geometry for future spatial operations 'data_source': 'berlin_open_data', 'last_updated': datetime.now().isoformat(), 'raw_properties': properties # Keep all original data for debugging } return processed_data except Exception as e: logger.error(f"Error processing green space feature: {e}") return None def _extract_centroid(self, geometry: Dict) -> Optional[Dict[str, float]]: """ Extract centroid coordinates from GeoJSON geometry. 

        Args:
            geometry: GeoJSON geometry object

        Returns:
            Dictionary with 'lat' and 'lng' keys or None if extraction fails
        """
        try:
            geom_type = geometry.get('type')
            coordinates = geometry.get('coordinates')

            if not coordinates:
                return None

            if geom_type == 'Polygon':
                # For polygon, use centroid of outer ring
                outer_ring = coordinates[0]
                if len(outer_ring) < 3:
                    return None

                # Calculate centroid
                lats = [coord[1] for coord in outer_ring if len(coord) >= 2]
                lngs = [coord[0] for coord in outer_ring if len(coord) >= 2]

                if not lats or not lngs:
                    return None

                return {
                    'lat': sum(lats) / len(lats),
                    'lng': sum(lngs) / len(lngs),
                }

            elif geom_type == 'Point':
                if len(coordinates) >= 2:
                    return {
                        'lat': coordinates[1],
                        'lng': coordinates[0],
                    }

            elif geom_type == 'MultiPolygon':
                # For multipolygon, use centroid of first polygon
                if coordinates and len(coordinates) > 0:
                    first_polygon = coordinates[0]
                    if first_polygon and len(first_polygon) > 0:
                        outer_ring = first_polygon[0]
                        lats = [coord[1] for coord in outer_ring if len(coord) >= 2]
                        lngs = [coord[0] for coord in outer_ring if len(coord) >= 2]
                        if lats and lngs:
                            return {
                                'lat': sum(lats) / len(lats),
                                'lng': sum(lngs) / len(lngs),
                            }

            # Fallback: return None for unsupported geometry types
            logger.warning(f"Unsupported geometry type: {geom_type}")
            return None

        except Exception as e:
            logger.error(f"Error extracting centroid: {e}")
            return None

    async def _save_raw_data(self, data: Dict, filename: str) -> None:
        """
        Save raw data to file for backup/debugging.

        Args:
            data: Raw data to save
            filename: Name of the file to save to
        """
        try:
            raw_file = self.raw_dir / filename
            with open(raw_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.debug(f"Saved raw data to {raw_file}")
        except Exception as e:
            logger.warning(f"Failed to save raw data: {e}")

    def validate_coordinates(self, lat: float, lng: float) -> bool:
        """
        Validate that coordinates are within Berlin bounds.

        Args:
            lat: Latitude
            lng: Longitude

        Returns:
            True if coordinates are within Berlin bounds
        """
        # Berlin approximate bounds
        BERLIN_BOUNDS = {
            'lat_min': 52.3,
            'lat_max': 52.7,
            'lng_min': 13.0,
            'lng_max': 13.8,
        }

        return (
            BERLIN_BOUNDS['lat_min'] <= lat <= BERLIN_BOUNDS['lat_max']
            and BERLIN_BOUNDS['lng_min'] <= lng <= BERLIN_BOUNDS['lng_max']
        )

    async def process_and_save_green_spaces(self) -> Dict[str, Any]:
        """
        Fetch, process, and save green spaces data.
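
        The processed output, including processing statistics, is written to
        app/data/processed/parks.json.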

        Returns:
            Summary of processing results
        """
        logger.info("Starting green spaces data processing...")

        try:
            # Fetch raw data
            raw_features = await self.fetch_green_spaces()

            # Process features
            processed_parks = []
            skipped_count = 0
            invalid_coords_count = 0

            for feature in raw_features:
                processed_park = self.process_green_space_feature(feature)

                if processed_park is None:
                    skipped_count += 1
                    continue

                # Validate coordinates
                coords = processed_park['coordinates']
                if not self.validate_coordinates(coords['lat'], coords['lng']):
                    invalid_coords_count += 1
                    logger.warning(f"Invalid coordinates for park {processed_park['name']}: {coords}")
                    continue

                processed_parks.append(processed_park)

            # Save processed data
            output_data = {
                'parks': processed_parks,
                'total_count': len(processed_parks),
                'data_source': 'berlin_open_data',
                'last_updated': datetime.now().isoformat(),
                'processing_stats': {
                    'raw_features': len(raw_features),
                    'processed_parks': len(processed_parks),
                    'skipped_features': skipped_count,
                    'invalid_coordinates': invalid_coords_count,
                },
            }

            processed_file = self.processed_dir / "parks.json"
            with open(processed_file, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, ensure_ascii=False, indent=2)

            logger.info(f"Successfully processed {len(processed_parks)} parks")
            logger.info(f"Skipped {skipped_count} features, {invalid_coords_count} had invalid coordinates")

            return output_data

        except Exception as e:
            logger.error(f"Error in process_and_save_green_spaces: {e}")
            raise


# Convenience function for easy usage
async def fetch_and_process_berlin_green_spaces() -> Dict[str, Any]:
    """
    Convenience function to fetch and process Berlin green spaces.

    Returns:
        Processing results summary
    """
    client = BerlinOpenDataClient()
    return await client.process_and_save_green_spaces()
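

# Minimal usage sketch: running this module directly performs a single
# fetch-and-process cycle. Assumes the working directory is the project root
# so the relative app/data paths resolve.
if __name__ == "__main__":
    import asyncio

    summary = asyncio.run(fetch_and_process_berlin_green_spaces())
    print(f"Processed {summary['total_count']} parks from {summary['data_source']}")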