#!/usr/bin/env python3
"""
Process Berlin Street Trees (Baumkataster) CSV data.

Converts the raw CSV into a structured JSON format for use in the picnic API.
"""

import pandas as pd
import json
from pathlib import Path
from datetime import datetime
import sys
import os

# Add the app directory to the Python path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
def _approx_wgs84(x_coord, y_coord):
    """Rough metre-grid -> WGS84 conversion for the Berlin area.

    Returns (lat, lng) when the point falls inside the expected Berlin
    bounding boxes (both in source metres and in converted degrees),
    otherwise None.

    NOTE(review): the accepted x/y ranges (1.48M-1.52M / 6.87M-6.92M)
    look like EPSG:3857 (Web Mercator) metres rather than EPSG:25833 as
    the original comment claimed -- confirm against the data source.
    This is a rough linear approximation; use a proper transformation
    (e.g. pyproj) in production.
    """
    # Basic coordinate validation (Berlin area check, source units)
    if not (1480000 <= x_coord <= 1520000 and 6870000 <= y_coord <= 6920000):
        return None

    lat = 52.3 + (y_coord - 6870000) / 111000  # Rough conversion
    lng = 13.0 + (x_coord - 1480000) / 71000   # Rough conversion

    # Validate converted coordinates (degrees, Berlin area)
    if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
        return None
    return lat, lng


def _clean_value(key, value):
    """Normalize one raw CSV field to a JSON-serializable Python value.

    Missing values become None; year/age/circumference fields become int,
    diameter/height become float, strings are stripped, and any remaining
    numpy scalar is unwrapped to its native Python equivalent (stdlib
    json.dump cannot serialize numpy.int64/float64).
    """
    if pd.isna(value):
        return None
    if key in ('planting_year', 'age', 'trunk_circumference_cm'):
        try:
            return int(float(value))
        except (ValueError, TypeError):
            return None
    if key in ('crown_diameter_m', 'height_m'):
        try:
            return float(value)
        except (ValueError, TypeError):
            return None
    if isinstance(value, str):
        return value.strip()
    # numpy scalars (e.g. OBJECTID as int64) expose .item(); unwrap them
    # so json.dump does not raise "Object of type int64 is not JSON
    # serializable" -- this aborted the whole run in the original code.
    if hasattr(value, 'item'):
        return value.item()
    return value


def process_street_trees():
    """Process the street trees CSV file and create a JSON file.

    Reads the raw Baumkataster CSV, converts each usable row into a
    JSON-serializable record with approximate WGS84 coordinates, and
    writes the aggregate (plus processing metadata) to
    app/data/processed/street_trees.json.

    Returns:
        bool: True on success, False when the raw file is missing or an
        unexpected error aborts processing.
    """
    # File paths
    raw_file = Path("app/data/raw/Baumkataster_Berlin_-1586189165523919690.csv")
    processed_file = Path("app/data/processed/street_trees.json")

    # Ensure processed directory exists
    processed_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"Reading street trees data from: {raw_file}")

    if not raw_file.exists():
        print(f"Error: Raw file not found at {raw_file}")
        return False

    try:
        # Read the CSV file
        df = pd.read_csv(raw_file, encoding='utf-8')
        print(f"Loaded {len(df)} street trees from CSV")

        # Display column names for debugging
        print("Columns in CSV:", df.columns.tolist())

        # Clean and process the data
        trees = []
        processed_count = 0
        skipped_count = 0

        for idx, row in df.iterrows():
            try:
                x_raw = row.get('x')
                y_raw = row.get('y')

                # Skip rows with missing coordinates
                if pd.isna(x_raw) or pd.isna(y_raw):
                    skipped_count += 1
                    continue

                # float() unwraps numpy scalars up front so x/y (and the
                # lat/lng derived from them) serialize cleanly with json.
                x_coord = float(x_raw)
                y_coord = float(y_raw)

                coords = _approx_wgs84(x_coord, y_coord)
                if coords is None:
                    skipped_count += 1
                    continue
                lat, lng = coords

                # Extract tree information
                tree_data = {
                    "id": f"tree_{processed_count + 1}",
                    "object_id": row.get('OBJECTID'),
                    "tree_id": row.get('Baum ID'),
                    "location_number": row.get('Standort Nr'),
                    "identifier": row.get('Kennzeich'),
                    "object_name": row.get('Objektname'),
                    "species_german": row.get('Art'),
                    "species_botanical": row.get('Art Botanisch'),
                    "genus_german": row.get('Gattung'),
                    "genus_botanical": row.get('Gattung Botanisch'),
                    "planting_year": row.get('Pflanzjahr'),
                    "age": row.get('Standalter'),
                    "crown_diameter_m": row.get('Krone Durchschnitt (m)'),
                    "trunk_circumference_cm": row.get('Stammumfang (cm)'),
                    "height_m": row.get('Höhe (m)'),
                    "district": row.get('Bezirk'),
                    "owner": row.get('Eigentümer'),
                    "category": row.get('Kategorie'),
                    "street": row.get('Straße'),
                    "house_number": row.get('Haus Nr'),
                    "address_addition": row.get('Adresszusatz'),
                    "lat": round(lat, 6),
                    "lng": round(lng, 6),
                    "x_coord": x_coord,
                    "y_coord": y_coord
                }

                # Clean up None values and convert to appropriate types
                for key, value in tree_data.items():
                    tree_data[key] = _clean_value(key, value)

                trees.append(tree_data)
                processed_count += 1

                # Progress indicator
                if processed_count % 10000 == 0:
                    print(f"Processed {processed_count} trees...")

            except Exception as e:
                # Best effort: a bad row is logged and skipped, not fatal.
                print(f"Error processing row {idx}: {e}")
                skipped_count += 1
                continue

        # Create the final data structure
        output_data = {
            "street_trees": trees,
            "count": len(trees),
            "processed_count": processed_count,
            "skipped_count": skipped_count,
            "last_updated": datetime.now().isoformat(),
            "source": "baumkataster_csv",
            "coordinate_system": "EPSG:25833_converted_to_WGS84",
            "note": "Coordinates converted from UTM to approximate WGS84. Use proper coordinate transformation in production."
        }

        # Write to JSON file
        print(f"Writing {len(trees)} trees to: {processed_file}")
        with open(processed_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"Successfully processed street trees data:")
        print(f"  - Total rows in CSV: {len(df)}")
        print(f"  - Successfully processed: {processed_count}")
        print(f"  - Skipped (invalid data): {skipped_count}")
        print(f"  - Output file: {processed_file}")

        # Display some sample data
        if trees:
            print("\nSample tree data:")
            sample_tree = trees[0]
            for key, value in sample_tree.items():
                print(f"  {key}: {value}")

        return True

    except Exception as e:
        # Top-level boundary: report and signal failure to the caller.
        print(f"Error processing street trees data: {e}")
        return False
|
if __name__ == "__main__":
    # Exit non-zero on failure so shell callers can detect a broken run.
    if process_street_trees():
        print("\nStreet trees processing completed successfully!")
    else:
        print("\nStreet trees processing failed!")
        sys.exit(1)