"""
|
|
Berlin Open Data integration client.
|
|
|
|
This module handles fetching and processing data from Berlin's Open Data Portal.
|
|
"""
|
|
|
|
import httpx
|
|
import json
|
|
from typing import List, Dict, Any, Optional
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|

class BerlinOpenDataClient:
    """Client for fetching data from Berlin Open Data Portal."""

    def __init__(self):
        # WFS/WMS endpoints for the s_gruenanlage (green spaces) dataset
        self.green_spaces_wfs = "https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt/s_gruenanlage"
        self.green_spaces_wms = "https://fbinter.stadt-berlin.de/fb/wms/senstadt/k_gruenanlage"
        self.base_url = "https://fbinter.stadt-berlin.de/fb/wfs/data/senstadt"
        self.data_dir = Path("app/data")
        self.timeout = 30.0

        # Default query parameters for the WFS GetFeature request.
        # WFS 2.0.0 uses "typeNames"; the 1.x fallbacks below use "typeName".
        self.green_spaces_params = {
            "service": "WFS",
            "version": "2.0.0",
            "request": "GetFeature",
            "typeNames": "fis:s_gruenanlage",
            "outputFormat": "application/json",
            "srsName": "EPSG:4326",  # WGS84 coordinate system
        }

        # Create data directories
        self.raw_dir = self.data_dir / "raw"
        self.processed_dir = self.data_dir / "processed"
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)

    async def fetch_green_spaces(self) -> List[Dict[str, Any]]:
        """
        Fetch green spaces from the Berlin WFS service via the s_gruenanlage endpoint.

        Returns:
            List of GeoJSON features representing green spaces

        Raises:
            httpx.HTTPError: If the request fails
            ValueError: If the response format is invalid
        """
        logger.info("Fetching green spaces from Berlin Open Data (s_gruenanlage)...")

        # Primary endpoint with the default parameters
        primary_endpoint = {
            'url': self.green_spaces_wfs,
            'params': self.green_spaces_params.copy()
        }

        # Fallback endpoints with alternative versions and parameter spellings
        fallback_endpoints = [
            {
                'url': self.green_spaces_wfs,
                'params': {
                    'service': 'WFS',
                    'version': '1.1.0',
                    'request': 'GetFeature',
                    'typeName': 'fis:s_gruenanlage',
                    'outputFormat': 'application/json',
                    'srsName': 'EPSG:4326'
                }
            },
            {
                'url': self.green_spaces_wfs,
                'params': {
                    'service': 'WFS',
                    'version': '2.0.0',
                    'request': 'GetFeature',
                    'typeName': 'fis:s_gruenanlage',  # legacy 1.x spelling, accepted by some servers
                    'outputFormat': 'application/json',
                    'srsName': 'EPSG:4326'
                }
            },
            {
                'url': self.green_spaces_wfs,
                'params': {
                    'service': 'WFS',
                    'version': '1.1.0',
                    'request': 'GetFeature',
                    'typeName': 's_gruenanlage',  # without the "fis:" namespace prefix
                    'outputFormat': 'application/json',
                    'srsName': 'EPSG:4326'
                }
            }
        ]

        # Try the primary endpoint first, then the fallbacks
        endpoints_to_try = [primary_endpoint] + fallback_endpoints
        last_error = None

        for i, endpoint in enumerate(endpoints_to_try):
            try:
                url = endpoint['url']
                params = endpoint['params']

                logger.info(f"Trying endpoint {i + 1}/{len(endpoints_to_try)}: {url}")
                logger.debug(f"Parameters: {params}")

                async with httpx.AsyncClient(timeout=self.timeout) as client:
                    response = await client.get(url, params=params)

                    # Log response details for debugging
                    logger.debug(f"Response status: {response.status_code}")
                    logger.debug(f"Response headers: {dict(response.headers)}")

                    if response.status_code == 200:
                        # Parse JSON response
                        data = response.json()

                        # Validate response structure
                        if 'features' in data:
                            features = data['features']
                            logger.info(f"Successfully fetched {len(features)} green spaces using endpoint {i + 1}")

                            # Save raw data for debugging/backup
                            await self._save_raw_data(data, "berlin_green_spaces_gruenanlage.geojson")

                            return features
                        else:
                            logger.warning(f"Endpoint {i + 1} returned data without 'features' field")
                            # Log the response structure for debugging
                            logger.debug(f"Response keys: {list(data.keys()) if isinstance(data, dict) else 'Not a dict'}")
                            continue
                    else:
                        logger.warning(f"Endpoint {i + 1} returned status {response.status_code}")
                        # Log the first 500 characters of the error body for debugging
                        try:
                            logger.debug(f"Error response: {response.text[:500]}")
                        except Exception:
                            pass
                        continue

            except Exception as e:
                logger.warning(f"Endpoint {i + 1} failed: {e}")
                last_error = e
                continue

        # If we get here, all endpoints failed
        if last_error:
            raise last_error
        raise ValueError("All WFS endpoints failed to return valid data")

    def process_green_space_feature(self, feature: Dict) -> Optional[Dict[str, Any]]:
        """
        Process a single green space feature into our standardized format.

        Args:
            feature: GeoJSON feature from Berlin Open Data

        Returns:
            Processed green space data or None if processing fails
        """
        try:
            properties = feature.get('properties', {})
            geometry = feature.get('geometry', {})

            # Skip features without essential data
            if not properties.get('gruenanlage') or not geometry:
                logger.warning(f"Skipping feature with missing essential data: {properties.get('gml_id', 'unknown')}")
                return None

            # Extract coordinates (centroid for polygons)
            coords = self._extract_centroid(geometry)
            if not coords:
                logger.warning(f"Could not extract coordinates for feature: {properties.get('gml_id', 'unknown')}")
                return None

            # Convert the area from hectares to square meters (1 ha = 10,000 m²)
            area_ha = properties.get('flaeche_ha')
            area_sqm = 0
            if area_ha:
                try:
                    area_sqm = int(float(area_ha) * 10000)
                except (ValueError, TypeError):
                    logger.warning(f"Invalid area value for feature {properties.get('gml_id')}: {area_ha}")

            # Clean and validate name
            name = str(properties.get('gruenanlage', 'Unnamed Green Space')).strip()
            if not name or name.lower() in ('null', 'none'):
                name = 'Unnamed Green Space'

            # Clean district and sub-district names
            district = str(properties.get('bezirk', '')).strip()
            sub_district = str(properties.get('ortsteil', '')).strip()

            # Normalize category
            category = str(properties.get('kategorie', 'park')).strip().lower()

            processed_data = {
                'id': f"berlin_{properties.get('gml_id', 'unknown')}",
                'name': name,
                'district': district,
                'sub_district': sub_district,
                'category': category,
                'area_sqm': area_sqm,
                'coordinates': coords,
                'geometry': geometry,  # Keep full geometry for future spatial operations
                'data_source': 'berlin_open_data',
                'last_updated': datetime.now().isoformat(),
                'raw_properties': properties  # Keep all original data for debugging
            }

            return processed_data

        except Exception as e:
            logger.error(f"Error processing green space feature: {e}")
            return None

    def _extract_centroid(self, geometry: Dict) -> Optional[Dict[str, float]]:
        """
        Extract approximate centroid coordinates from a GeoJSON geometry.

        Uses the average of the ring vertices, which is a rough approximation
        of the true centroid.

        Args:
            geometry: GeoJSON geometry object

        Returns:
            Dictionary with 'lat' and 'lng' keys or None if extraction fails
        """
        try:
            geom_type = geometry.get('type')
            coordinates = geometry.get('coordinates')

            if not coordinates:
                return None

            if geom_type == 'Polygon':
                # For a polygon, average the vertices of the outer ring
                outer_ring = coordinates[0]
                if len(outer_ring) < 3:
                    return None

                lats = [coord[1] for coord in outer_ring if len(coord) >= 2]
                lngs = [coord[0] for coord in outer_ring if len(coord) >= 2]

                if not lats or not lngs:
                    return None

                return {
                    'lat': sum(lats) / len(lats),
                    'lng': sum(lngs) / len(lngs)
                }

            elif geom_type == 'Point':
                if len(coordinates) >= 2:
                    return {
                        'lat': coordinates[1],
                        'lng': coordinates[0]
                    }

            elif geom_type == 'MultiPolygon':
                # For a multipolygon, average the outer ring of the first polygon
                if coordinates and coordinates[0]:
                    outer_ring = coordinates[0][0]
                    lats = [coord[1] for coord in outer_ring if len(coord) >= 2]
                    lngs = [coord[0] for coord in outer_ring if len(coord) >= 2]

                    if lats and lngs:
                        return {
                            'lat': sum(lats) / len(lats),
                            'lng': sum(lngs) / len(lngs)
                        }

            # Unsupported geometry type
            logger.warning(f"Unsupported geometry type: {geom_type}")
            return None

        except Exception as e:
            logger.error(f"Error extracting centroid: {e}")
            return None
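
    @staticmethod
    def _area_weighted_centroid(ring: List[List[float]]) -> Optional[Dict[str, float]]:
        """
        Optional sketch, not wired into _extract_centroid above: an
        area-weighted polygon centroid via the shoelace formula. The vertex
        average used above skews toward densely sampled edges; this variant
        weights by signed area instead. Assumes a simple (non-self-intersecting)
        ring of [lng, lat] pairs; treat it as illustrative, not authoritative.
        """
        if len(ring) < 3:
            return None
        # Close the ring if the dataset leaves it open
        pts = ring if ring[0] == ring[-1] else ring + [ring[0]]
        area2 = cx = cy = 0.0  # area2 accumulates twice the signed area
        for p0, p1 in zip(pts, pts[1:]):
            x0, y0 = p0[0], p0[1]
            x1, y1 = p1[0], p1[1]
            cross = x0 * y1 - x1 * y0
            area2 += cross
            cx += (x0 + x1) * cross
            cy += (y0 + y1) * cross
        if area2 == 0:
            return None  # degenerate ring; caller should fall back to the vertex average
        # Centroid components are the weighted sums divided by 6 * area,
        # and area = area2 / 2, hence the factor of 3 * area2
        return {'lat': cy / (3 * area2), 'lng': cx / (3 * area2)}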

    async def _save_raw_data(self, data: Dict, filename: str) -> None:
        """
        Save raw data to file for backup/debugging.

        Args:
            data: Raw data to save
            filename: Name of the file to save to
        """
        try:
            raw_file = self.raw_dir / filename
            with open(raw_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.debug(f"Saved raw data to {raw_file}")
        except Exception as e:
            logger.warning(f"Failed to save raw data: {e}")

    def validate_coordinates(self, lat: float, lng: float) -> bool:
        """
        Validate that coordinates are within Berlin bounds.

        Args:
            lat: Latitude
            lng: Longitude

        Returns:
            True if coordinates are within Berlin bounds
        """
        # Approximate bounding box for Berlin
        BERLIN_BOUNDS = {
            'lat_min': 52.3,
            'lat_max': 52.7,
            'lng_min': 13.0,
            'lng_max': 13.8
        }

        return (
            BERLIN_BOUNDS['lat_min'] <= lat <= BERLIN_BOUNDS['lat_max'] and
            BERLIN_BOUNDS['lng_min'] <= lng <= BERLIN_BOUNDS['lng_max']
        )

    async def process_and_save_green_spaces(self) -> Dict[str, Any]:
        """
        Fetch, process, and save green spaces data.

        Returns:
            Summary of processing results
        """
        logger.info("Starting green spaces data processing...")

        try:
            # Fetch raw data
            raw_features = await self.fetch_green_spaces()

            # Process features
            processed_parks = []
            skipped_count = 0
            invalid_coords_count = 0

            for feature in raw_features:
                processed_park = self.process_green_space_feature(feature)

                if processed_park is None:
                    skipped_count += 1
                    continue

                # Validate coordinates
                coords = processed_park['coordinates']
                if not self.validate_coordinates(coords['lat'], coords['lng']):
                    invalid_coords_count += 1
                    logger.warning(f"Invalid coordinates for park {processed_park['name']}: {coords}")
                    continue

                processed_parks.append(processed_park)

            # Save processed data
            output_data = {
                'parks': processed_parks,
                'total_count': len(processed_parks),
                'data_source': 'berlin_open_data',
                'last_updated': datetime.now().isoformat(),
                'processing_stats': {
                    'raw_features': len(raw_features),
                    'processed_parks': len(processed_parks),
                    'skipped_features': skipped_count,
                    'invalid_coordinates': invalid_coords_count
                }
            }

            processed_file = self.processed_dir / "parks.json"
            with open(processed_file, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, ensure_ascii=False, indent=2)

            logger.info(f"Successfully processed {len(processed_parks)} parks")
            logger.info(f"Skipped {skipped_count} features, {invalid_coords_count} had invalid coordinates")

            return output_data

        except Exception as e:
            logger.error(f"Error in process_and_save_green_spaces: {e}")
            raise


# Convenience function for easy usage
async def fetch_and_process_berlin_green_spaces() -> Dict[str, Any]:
    """
    Convenience function to fetch and process Berlin green spaces.

    Returns:
        Processing results summary
    """
    client = BerlinOpenDataClient()
    return await client.process_and_save_green_spaces()
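

# Minimal usage sketch: run the module directly to fetch and process the data
# end to end. The "app.berlin_open_data" module path in the command below is
# an assumption; adjust it to wherever this file lives in your project:
#
#     python -m app.berlin_open_data
if __name__ == "__main__":
    import asyncio

    logging.basicConfig(level=logging.INFO)
    summary = asyncio.run(fetch_and_process_berlin_green_spaces())
    print(f"Processed {summary['total_count']} parks; stats: {summary['processing_stats']}")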