berlin-picnic-api/scripts/process_street_trees.py

#!/usr/bin/env python3
"""
Process Berlin Street Trees (Baumkataster) CSV data.
Converts the raw CSV into a structured JSON format for use in the picnic API.
"""
import json
import os
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd

# Add the project root (one level up) to the Python path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


def process_street_trees():
    """Process the street trees CSV file and create a JSON file."""
    # File paths (relative to the project root)
    raw_file = Path("app/data/raw/Baumkataster_Berlin_-1586189165523919690.csv")
    processed_file = Path("app/data/processed/street_trees.json")

    # Ensure processed directory exists
    processed_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"Reading street trees data from: {raw_file}")
    if not raw_file.exists():
        print(f"Error: Raw file not found at {raw_file}")
        return False

    try:
        # Read the CSV file
        df = pd.read_csv(raw_file, encoding='utf-8')
        print(f"Loaded {len(df)} street trees from CSV")

        # Display column names for debugging
        print("Columns in CSV:", df.columns.tolist())

        # Clean and process the data
        trees = []
        processed_count = 0
        skipped_count = 0

        for idx, row in df.iterrows():
            try:
                # Extract coordinates
                x_coord = row.get('x')
                y_coord = row.get('y')

                # Skip rows with missing coordinates
                if pd.isna(x_coord) or pd.isna(y_coord):
                    skipped_count += 1
                    continue

                # Basic coordinate validation (Berlin area check). The
                # magnitudes accepted here (x ~1.48-1.52M, y ~6.87-6.92M)
                # match EPSG:3857 (Web Mercator) values for Berlin rather
                # than EPSG:25833 (ETRS89 / UTM zone 33N), whose Berlin
                # eastings/northings sit around 370-415k and 5.80-5.84M.
                if not (1480000 <= x_coord <= 1520000 and 6870000 <= y_coord <= 6920000):
                    skipped_count += 1
                    continue

                # Rough linear conversion to approximate WGS84 lat/lng for
                # the Berlin area - use a proper coordinate transformation
                # in production.
                lat = 52.3 + (y_coord - 6870000) / 111000  # Rough conversion
                lng = 13.0 + (x_coord - 1480000) / 71000   # Rough conversion
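
                # Sketch of a proper transformation (assumes pyproj is
                # installed and that the source CRS really is EPSG:3857,
                # as the bounds above suggest; build the Transformer once,
                # outside the loop):
                #   from pyproj import Transformer
                #   to_wgs84 = Transformer.from_crs("EPSG:3857", "EPSG:4326", always_xy=True)
                #   lng, lat = to_wgs84.transform(x_coord, y_coord)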
                # Validate converted coordinates
                if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
                    skipped_count += 1
                    continue

                # Extract tree information
                tree_data = {
                    "id": f"tree_{processed_count + 1}",
                    "object_id": row.get('OBJECTID'),
                    "tree_id": row.get('Baum ID'),
                    "location_number": row.get('Standort Nr'),
                    "identifier": row.get('Kennzeich'),
                    "object_name": row.get('Objektname'),
                    "species_german": row.get('Art'),
                    "species_botanical": row.get('Art Botanisch'),
                    "genus_german": row.get('Gattung'),
                    "genus_botanical": row.get('Gattung Botanisch'),
                    "planting_year": row.get('Pflanzjahr'),
                    "age": row.get('Standalter'),
                    "crown_diameter_m": row.get('Krone Durchschnitt (m)'),
                    "trunk_circumference_cm": row.get('Stammumfang (cm)'),
                    "height_m": row.get('Höhe (m)'),
                    "district": row.get('Bezirk'),
                    "owner": row.get('Eigentümer'),
                    "category": row.get('Kategorie'),
                    "street": row.get('Straße'),
                    "house_number": row.get('Haus Nr'),
                    "address_addition": row.get('Adresszusatz'),
                    "lat": round(lat, 6),
                    "lng": round(lng, 6),
                    "x_coord": x_coord,
                    "y_coord": y_coord
                }

                # Clean up missing values and convert to appropriate types
                for key, value in tree_data.items():
                    if pd.isna(value):
                        tree_data[key] = None
                    elif key in ['planting_year', 'age', 'trunk_circumference_cm']:
                        try:
                            tree_data[key] = int(float(value))
                        except (ValueError, TypeError):
                            tree_data[key] = None
                    elif key in ['crown_diameter_m', 'height_m']:
                        try:
                            tree_data[key] = float(value)
                        except (ValueError, TypeError):
                            tree_data[key] = None
                    elif isinstance(value, str):
                        tree_data[key] = value.strip()
                    elif hasattr(value, 'item'):
                        # Cast numpy scalars (e.g. int64 OBJECTIDs) to
                        # native Python types so json.dump can serialize them
                        tree_data[key] = value.item()

                trees.append(tree_data)
                processed_count += 1

                # Progress indicator
                if processed_count % 10000 == 0:
                    print(f"Processed {processed_count} trees...")

            except Exception as e:
                print(f"Error processing row {idx}: {e}")
                skipped_count += 1
                continue
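
        # Note: df.iterrows() is convenient but slow on large catalogues; if
        # this dataset grows, a vectorized pass over the coordinate columns
        # (or df.itertuples()) would cut the runtime considerably.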
        # Create the final data structure
        output_data = {
            "street_trees": trees,
            "count": len(trees),
            "processed_count": processed_count,
            "skipped_count": skipped_count,
            "last_updated": datetime.now().isoformat(),
            "source": "baumkataster_csv",
            "coordinate_system": "web_mercator_converted_to_WGS84_approx",
            "note": "Coordinates appear to be Web Mercator (EPSG:3857) and were converted to approximate WGS84 with a rough linear fit. Use a proper coordinate transformation in production."
        }

        # Write to JSON file
        print(f"Writing {len(trees)} trees to: {processed_file}")
        with open(processed_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print("Successfully processed street trees data:")
        print(f" - Total rows in CSV: {len(df)}")
        print(f" - Successfully processed: {processed_count}")
        print(f" - Skipped (invalid data): {skipped_count}")
        print(f" - Output file: {processed_file}")

        # Display some sample data
        if trees:
            print("\nSample tree data:")
            sample_tree = trees[0]
            for key, value in sample_tree.items():
                print(f" {key}: {value}")

        return True

    except Exception as e:
        print(f"Error processing street trees data: {e}")
        return False


if __name__ == "__main__":
    success = process_street_trees()
    if success:
        print("\nStreet trees processing completed successfully!")
    else:
        print("\nStreet trees processing failed!")
        sys.exit(1)
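

# Usage sketch (run from the project root so the relative data paths above
# resolve):
#   python scripts/process_street_trees.py
#
# A hypothetical downstream consumer can then load the processed file with:
#   with open("app/data/processed/street_trees.json", encoding="utf-8") as f:
#       data = json.load(f)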