# berlin-picnic-api/app/services/data_integration/berlin_open_data.py

"""
Berlin Open Data integration client.
This module handles fetching and processing data from Berlin's Open Data Portal.
"""
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

import httpx

logger = logging.getLogger(__name__)

class BerlinOpenDataClient:
    """Client for fetching data from Berlin Open Data Portal."""

    def __init__(self):
        # WFS/WMS endpoints for Berlin's green space data (s_gruenanlage layer)
        self.green_spaces_wfs = "https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt/s_gruenanlage"
        self.green_spaces_wms = "https://fbinter.stadt-berlin.de/fb/wms/senstadt/k_gruenanlage"
        self.base_url = "https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt"
        self.data_dir = Path("app/data")
        self.timeout = 30.0
        # Default parameters for the WFS GetFeature request
        self.green_spaces_params = {
            "service": "WFS",
            "version": "2.0.0",
            "request": "GetFeature",
            "typeName": "fis:s_gruenanlage",
            "outputFormat": "application/json",
            "srsName": "EPSG:4326",  # WGS84 coordinate system
        }
        # Create data directories
        self.raw_dir = self.data_dir / "raw"
        self.processed_dir = self.data_dir / "processed"
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)
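
    # For reference, the GetFeature request built from the defaults above
    # looks like this (an illustrative sketch, not verified against the live
    # service):
    #
    #     GET https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt/s_gruenanlage
    #         ?service=WFS&version=2.0.0&request=GetFeature
    #         &typeName=fis:s_gruenanlage&outputFormat=application/json
    #         &srsName=EPSG:4326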

    async def fetch_green_spaces(self) -> List[Dict[str, Any]]:
        """
        Fetch green spaces from the Berlin WFS service using the s_gruenanlage endpoint.

        Returns:
            List of GeoJSON features representing green spaces

        Raises:
            httpx.HTTPError: If the request fails
            ValueError: If no endpoint returns valid data
        """
        logger.info("Fetching green spaces from Berlin Open Data (s_gruenanlage)...")
        # Primary endpoint with the default parameters
        primary_endpoint = {
            'url': self.green_spaces_wfs,
            'params': self.green_spaces_params.copy()
        }
        # Fallback endpoints with alternative parameter spellings
        # (WFS 1.1.0 expects `typeName`; WFS 2.0.0 expects `typeNames`)
        fallback_endpoints = [
            {
                'url': self.green_spaces_wfs,
                'params': {
                    'service': 'WFS',
                    'version': '1.1.0',
                    'request': 'GetFeature',
                    'typeName': 'fis:s_gruenanlage',
                    'outputFormat': 'application/json',
                    'srsName': 'EPSG:4326'
                }
            },
            {
                'url': self.green_spaces_wfs,
                'params': {
                    'service': 'WFS',
                    'version': '2.0.0',
                    'request': 'GetFeature',
                    'typeNames': 'fis:s_gruenanlage',
                    'outputFormat': 'application/json',
                    'srsName': 'EPSG:4326'
                }
            },
            {
                'url': self.green_spaces_wfs,
                'params': {
                    'service': 'WFS',
                    'version': '1.1.0',
                    'request': 'GetFeature',
                    'typeName': 's_gruenanlage',
                    'outputFormat': 'application/json',
                    'srsName': 'EPSG:4326'
                }
            }
        ]
        # Try the primary endpoint first, then the fallbacks
        endpoints_to_try = [primary_endpoint] + fallback_endpoints
        last_error = None
        for i, endpoint in enumerate(endpoints_to_try):
            try:
                url = endpoint['url']
                params = endpoint['params']
                logger.info(f"Trying endpoint {i + 1}/{len(endpoints_to_try)}: {url}")
                logger.debug(f"Parameters: {params}")
                async with httpx.AsyncClient(timeout=self.timeout) as client:
                    response = await client.get(url, params=params)
                    # Log response details for debugging
                    logger.debug(f"Response status: {response.status_code}")
                    logger.debug(f"Response headers: {dict(response.headers)}")
                    if response.status_code == 200:
                        # Parse the JSON response
                        data = response.json()
                        # Validate the response structure
                        if 'features' in data:
                            features = data['features']
                            logger.info(f"Successfully fetched {len(features)} green spaces using endpoint {i + 1}")
                            # Save raw data for debugging/backup
                            await self._save_raw_data(data, "berlin_green_spaces_gruenanlage.geojson")
                            return features
                        logger.warning(f"Endpoint {i + 1} returned data without a 'features' field")
                        # Log the response structure for debugging
                        logger.debug(f"Response keys: {list(data.keys()) if isinstance(data, dict) else 'Not a dict'}")
                    else:
                        logger.warning(f"Endpoint {i + 1} returned status {response.status_code}")
                        # Log the first 500 characters of the error body, if any
                        try:
                            logger.debug(f"Error response: {response.text[:500]}")
                        except Exception:
                            pass
            except Exception as e:
                logger.warning(f"Endpoint {i + 1} failed: {e}")
                last_error = e
        # If we get here, all endpoints failed
        if last_error:
            raise last_error
        raise ValueError("All WFS endpoints failed to return valid data")
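
    # A successful response is a GeoJSON FeatureCollection, roughly shaped
    # like this (a sketch; the exact property names come from the live service):
    #
    #     {
    #         "type": "FeatureCollection",
    #         "features": [
    #             {"type": "Feature", "properties": {...}, "geometry": {...}},
    #             ...
    #         ]
    #     }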

    def process_green_space_feature(self, feature: Dict) -> Optional[Dict[str, Any]]:
        """
        Process a single green space feature into our standardized format.

        Args:
            feature: GeoJSON feature from Berlin Open Data

        Returns:
            Processed green space data, or None if processing fails
        """
        try:
            properties = feature.get('properties', {})
            geometry = feature.get('geometry', {})
            # Skip features without essential data
            if not properties.get('gruenanlage') or not geometry:
                logger.warning(f"Skipping feature with missing essential data: {properties.get('gml_id', 'unknown')}")
                return None
            # Extract coordinates (centroid for polygons)
            coords = self._extract_centroid(geometry)
            if not coords:
                logger.warning(f"Could not extract coordinates for feature: {properties.get('gml_id', 'unknown')}")
                return None
            # Convert the area from hectares to square metres (1 ha = 10,000 m²)
            area_ha = properties.get('flaeche_ha')
            area_sqm = 0
            if area_ha:
                try:
                    area_sqm = int(float(area_ha) * 10000)
                except (ValueError, TypeError):
                    logger.warning(f"Invalid area value for feature {properties.get('gml_id')}: {area_ha}")
            # Clean and validate the name
            name = str(properties.get('gruenanlage', 'Unnamed Green Space')).strip()
            if not name or name.lower() in ('null', 'none'):
                name = 'Unnamed Green Space'
            # Clean district and sub-district names
            district = str(properties.get('bezirk', '')).strip()
            sub_district = str(properties.get('ortsteil', '')).strip()
            # Normalize the category
            category = str(properties.get('kategorie', 'park')).strip().lower()
            return {
                'id': f"berlin_{properties.get('gml_id', 'unknown')}",
                'name': name,
                'district': district,
                'sub_district': sub_district,
                'category': category,
                'area_sqm': area_sqm,
                'coordinates': coords,
                'geometry': geometry,  # Keep full geometry for future spatial operations
                'data_source': 'berlin_open_data',
                'last_updated': datetime.now().isoformat(),
                'raw_properties': properties  # Keep all original data for debugging
            }
        except Exception as e:
            logger.error(f"Error processing green space feature: {e}")
            return None
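
    # Illustrative input/output for process_green_space_feature (a sketch;
    # the values are hypothetical, the field names follow the keys read above):
    #
    #     feature = {
    #         'properties': {'gml_id': 'x123', 'gruenanlage': 'Volkspark Beispiel',
    #                        'bezirk': 'Pankow', 'flaeche_ha': '5.2'},
    #         'geometry': {'type': 'Point', 'coordinates': [13.40, 52.52]},
    #     }
    #     client.process_green_space_feature(feature)
    #     # -> {'id': 'berlin_x123', 'name': 'Volkspark Beispiel',
    #     #     'area_sqm': 52000, 'coordinates': {'lat': 52.52, 'lng': 13.40}, ...}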

    def _extract_centroid(self, geometry: Dict) -> Optional[Dict[str, float]]:
        """
        Extract approximate centroid coordinates from a GeoJSON geometry.

        Args:
            geometry: GeoJSON geometry object

        Returns:
            Dictionary with 'lat' and 'lng' keys, or None if extraction fails
        """
        try:
            geom_type = geometry.get('type')
            coordinates = geometry.get('coordinates')
            if not coordinates:
                return None
            if geom_type == 'Polygon':
                # For a polygon, average the vertices of the outer ring
                outer_ring = coordinates[0]
                if len(outer_ring) < 3:
                    return None
                lats = [coord[1] for coord in outer_ring if len(coord) >= 2]
                lngs = [coord[0] for coord in outer_ring if len(coord) >= 2]
                if not lats or not lngs:
                    return None
                return {
                    'lat': sum(lats) / len(lats),
                    'lng': sum(lngs) / len(lngs)
                }
            elif geom_type == 'Point':
                if len(coordinates) >= 2:
                    return {
                        'lat': coordinates[1],
                        'lng': coordinates[0]
                    }
            elif geom_type == 'MultiPolygon':
                # For a multipolygon, average the outer ring of the first polygon
                if coordinates:
                    first_polygon = coordinates[0]
                    if first_polygon:
                        outer_ring = first_polygon[0]
                        lats = [coord[1] for coord in outer_ring if len(coord) >= 2]
                        lngs = [coord[0] for coord in outer_ring if len(coord) >= 2]
                        if lats and lngs:
                            return {
                                'lat': sum(lats) / len(lats),
                                'lng': sum(lngs) / len(lngs)
                            }
            # Fallback: return None for unsupported geometry types
            logger.warning(f"Unsupported geometry type: {geom_type}")
            return None
        except Exception as e:
            logger.error(f"Error extracting centroid: {e}")
            return None
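
    # Note: the vertex average used above only approximates the true centroid
    # and is biased toward densely sampled edges. If exact centroids matter, a
    # sketch using the optional `shapely` dependency (not currently required
    # by this module) would be:
    #
    #     from shapely.geometry import shape
    #     centroid = shape(geometry).centroid
    #     return {'lat': centroid.y, 'lng': centroid.x}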

    async def _save_raw_data(self, data: Dict, filename: str) -> None:
        """
        Save raw data to file for backup/debugging.

        Args:
            data: Raw data to save
            filename: Name of the file to save to
        """
        try:
            raw_file = self.raw_dir / filename
            with open(raw_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.debug(f"Saved raw data to {raw_file}")
        except Exception as e:
            logger.warning(f"Failed to save raw data: {e}")

    def validate_coordinates(self, lat: float, lng: float) -> bool:
        """
        Validate that coordinates fall within Berlin's approximate bounds.

        Args:
            lat: Latitude
            lng: Longitude

        Returns:
            True if the coordinates fall within the bounding box
        """
        # Approximate bounding box around Berlin (WGS84 degrees)
        BERLIN_BOUNDS = {
            'lat_min': 52.3,
            'lat_max': 52.7,
            'lng_min': 13.0,
            'lng_max': 13.8
        }
        return (
            BERLIN_BOUNDS['lat_min'] <= lat <= BERLIN_BOUNDS['lat_max'] and
            BERLIN_BOUNDS['lng_min'] <= lng <= BERLIN_BOUNDS['lng_max']
        )
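
    # Example (illustrative): the Brandenburg Gate at roughly (52.516, 13.378)
    # passes this check, but so do points well outside the city limits, e.g.
    # central Potsdam (~52.401, 13.059); the box is a coarse sanity filter,
    # not a precise city boundary.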

    async def process_and_save_green_spaces(self) -> Dict[str, Any]:
        """
        Fetch, process, and save green spaces data.

        Returns:
            Summary of processing results
        """
        logger.info("Starting green spaces data processing...")
        try:
            # Fetch raw data
            raw_features = await self.fetch_green_spaces()
            # Process features
            processed_parks = []
            skipped_count = 0
            invalid_coords_count = 0
            for feature in raw_features:
                processed_park = self.process_green_space_feature(feature)
                if processed_park is None:
                    skipped_count += 1
                    continue
                # Validate coordinates
                coords = processed_park['coordinates']
                if not self.validate_coordinates(coords['lat'], coords['lng']):
                    invalid_coords_count += 1
                    logger.warning(f"Invalid coordinates for park {processed_park['name']}: {coords}")
                    continue
                processed_parks.append(processed_park)
            # Save processed data
            output_data = {
                'parks': processed_parks,
                'total_count': len(processed_parks),
                'data_source': 'berlin_open_data',
                'last_updated': datetime.now().isoformat(),
                'processing_stats': {
                    'raw_features': len(raw_features),
                    'processed_parks': len(processed_parks),
                    'skipped_features': skipped_count,
                    'invalid_coordinates': invalid_coords_count
                }
            }
            processed_file = self.processed_dir / "parks.json"
            with open(processed_file, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, ensure_ascii=False, indent=2)
            logger.info(f"Successfully processed {len(processed_parks)} parks")
            logger.info(f"Skipped {skipped_count} features; {invalid_coords_count} had invalid coordinates")
            return output_data
        except Exception as e:
            logger.error(f"Error in process_and_save_green_spaces: {e}")
            raise

# Convenience function for easy usage
async def fetch_and_process_berlin_green_spaces() -> Dict[str, Any]:
    """
    Convenience function to fetch and process Berlin green spaces.

    Returns:
        Processing results summary
    """
    client = BerlinOpenDataClient()
    return await client.process_and_save_green_spaces()
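
if __name__ == "__main__":
    # Minimal smoke test (a sketch: assumes network access to the Berlin WFS
    # service and write permission for app/data/ relative to the working
    # directory).
    import asyncio

    summary = asyncio.run(fetch_and_process_berlin_green_spaces())
    print(json.dumps(summary["processing_stats"], ensure_ascii=False, indent=2))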