berlin-picnic-api/scripts/process_street_trees.py

#!/usr/bin/env python3
"""
Process Berlin Street Trees (Baumkataster) CSV data.
Converts the raw CSV into a structured JSON format for use in the picnic API.
"""
import json
import os
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd

# Add the project root (one level up) to the Python path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


def process_street_trees():
    """Process the street trees CSV file and create a JSON file."""
    # File paths (relative to the project root)
    raw_file = Path("app/data/raw/Baumkataster_Berlin_-1586189165523919690.csv")
    processed_file = Path("app/data/processed/street_trees.json")

    # Ensure processed directory exists
    processed_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"Reading street trees data from: {raw_file}")
    if not raw_file.exists():
        print(f"Error: Raw file not found at {raw_file}")
        return False

    try:
        # Read the CSV file
        df = pd.read_csv(raw_file, encoding='utf-8')
        print(f"Loaded {len(df)} street trees from CSV")

        # Display column names for debugging
        print("Columns in CSV:", df.columns.tolist())

        # Clean and process the data
        trees = []
        processed_count = 0
        skipped_count = 0

        for idx, row in df.iterrows():
            try:
                # Extract coordinates
                x_coord = row.get('x')
                y_coord = row.get('y')

                # Skip rows with missing coordinates
                if pd.isna(x_coord) or pd.isna(y_coord):
                    skipped_count += 1
                    continue

                # Basic coordinate validation (Berlin area check). The
                # magnitudes accepted here (x ~1.48-1.52M, y ~6.87-6.92M)
                # match EPSG:3857 (Web Mercator) values for Berlin rather
                # than EPSG:25833 (ETRS89 / UTM zone 33N), whose Berlin
                # eastings/northings sit around 370-415k and 5.80-5.84M.
                if not (1480000 <= x_coord <= 1520000 and 6870000 <= y_coord <= 6920000):
                    skipped_count += 1
                    continue

                # Rough linear conversion to approximate WGS84 lat/lng for
                # the Berlin area - use a proper coordinate transformation
                # in production.
                lat = 52.3 + (y_coord - 6870000) / 111000  # Rough conversion
                lng = 13.0 + (x_coord - 1480000) / 71000   # Rough conversion
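
                # Sketch of a proper transformation (assumes pyproj is
                # installed and that the source CRS really is EPSG:3857,
                # as the bounds above suggest; build the Transformer once,
                # outside the loop):
                #   from pyproj import Transformer
                #   to_wgs84 = Transformer.from_crs("EPSG:3857", "EPSG:4326", always_xy=True)
                #   lng, lat = to_wgs84.transform(x_coord, y_coord)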
                # Validate converted coordinates
                if not (52.3 <= lat <= 52.7 and 13.0 <= lng <= 13.8):
                    skipped_count += 1
                    continue

                # Extract tree information
                tree_data = {
                    "id": f"tree_{processed_count + 1}",
                    "object_id": row.get('OBJECTID'),
                    "tree_id": row.get('Baum ID'),
                    "location_number": row.get('Standort Nr'),
                    "identifier": row.get('Kennzeich'),
                    "object_name": row.get('Objektname'),
                    "species_german": row.get('Art'),
                    "species_botanical": row.get('Art Botanisch'),
                    "genus_german": row.get('Gattung'),
                    "genus_botanical": row.get('Gattung Botanisch'),
                    "planting_year": row.get('Pflanzjahr'),
                    "age": row.get('Standalter'),
                    "crown_diameter_m": row.get('Krone Durchschnitt (m)'),
                    "trunk_circumference_cm": row.get('Stammumfang (cm)'),
                    "height_m": row.get('Höhe (m)'),
                    "district": row.get('Bezirk'),
                    "owner": row.get('Eigentümer'),
                    "category": row.get('Kategorie'),
                    "street": row.get('Straße'),
                    "house_number": row.get('Haus Nr'),
                    "address_addition": row.get('Adresszusatz'),
                    "lat": round(lat, 6),
                    "lng": round(lng, 6),
                    "x_coord": x_coord,
                    "y_coord": y_coord
                }

                # Clean up missing values and convert to appropriate types
                for key, value in tree_data.items():
                    if pd.isna(value):
                        tree_data[key] = None
                    elif key in ['planting_year', 'age', 'trunk_circumference_cm']:
                        try:
                            tree_data[key] = int(float(value))
                        except (ValueError, TypeError):
                            tree_data[key] = None
                    elif key in ['crown_diameter_m', 'height_m']:
                        try:
                            tree_data[key] = float(value)
                        except (ValueError, TypeError):
                            tree_data[key] = None
                    elif isinstance(value, str):
                        tree_data[key] = value.strip()
                    elif hasattr(value, 'item'):
                        # Cast numpy scalars (e.g. int64 OBJECTIDs) to
                        # native Python types so json.dump can serialize them
                        tree_data[key] = value.item()

                trees.append(tree_data)
                processed_count += 1

                # Progress indicator
                if processed_count % 10000 == 0:
                    print(f"Processed {processed_count} trees...")

            except Exception as e:
                print(f"Error processing row {idx}: {e}")
                skipped_count += 1
                continue
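
        # Note: df.iterrows() is convenient but slow on large catalogues; if
        # this dataset grows, a vectorized pass over the coordinate columns
        # (or df.itertuples()) would cut the runtime considerably.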
        # Create the final data structure
        output_data = {
            "street_trees": trees,
            "count": len(trees),
            "processed_count": processed_count,
            "skipped_count": skipped_count,
            "last_updated": datetime.now().isoformat(),
            "source": "baumkataster_csv",
            "coordinate_system": "web_mercator_converted_to_WGS84_approx",
            "note": "Coordinates appear to be Web Mercator (EPSG:3857) and were converted to approximate WGS84 with a rough linear fit. Use a proper coordinate transformation in production."
        }

        # Write to JSON file
        print(f"Writing {len(trees)} trees to: {processed_file}")
        with open(processed_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print("Successfully processed street trees data:")
        print(f" - Total rows in CSV: {len(df)}")
        print(f" - Successfully processed: {processed_count}")
        print(f" - Skipped (invalid data): {skipped_count}")
        print(f" - Output file: {processed_file}")

        # Display some sample data
        if trees:
            print("\nSample tree data:")
            sample_tree = trees[0]
            for key, value in sample_tree.items():
                print(f" {key}: {value}")

        return True

    except Exception as e:
        print(f"Error processing street trees data: {e}")
        return False


if __name__ == "__main__":
    success = process_street_trees()
    if success:
        print("\nStreet trees processing completed successfully!")
    else:
        print("\nStreet trees processing failed!")
        sys.exit(1)
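

# Usage sketch (run from the project root so the relative data paths above
# resolve):
#   python scripts/process_street_trees.py
#
# A hypothetical downstream consumer can then load the processed file with:
#   with open("app/data/processed/street_trees.json", encoding="utf-8") as f:
#       data = json.load(f)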