#!/usr/bin/env python3
"""
Process Berlin Street Trees (Baumkataster) CSV data.

Converts the raw CSV into a structured JSON format for use in the picnic API.
"""

import json
import math
import os
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd

# Add the app directory to the Python path
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

# Default locations, relative to the current working directory.
_DEFAULT_RAW_FILE = Path("app/data/raw/Baumkataster_Berlin_-1586189165523919690.csv")
_DEFAULT_PROCESSED_FILE = Path("app/data/processed/street_trees.json")

# Sphere radius used by the Web Mercator projection (EPSG:3857).
_EARTH_RADIUS_M = 6378137.0

# Accepted range of the raw x/y values. These numbers only contain Berlin when
# read as Web Mercator (EPSG:3857) metres; in EPSG:25833 (UTM 33N) Berlin lies
# near x ~ 390,000 / y ~ 5,820,000 and would never pass this check.
# NOTE(review): confirm the CRS against the dataset's metadata.
_X_MIN, _X_MAX = 1480000, 1520000
_Y_MIN, _Y_MAX = 6870000, 6920000

# Sanity window for the converted WGS84 coordinates (greater Berlin area).
_LAT_MIN, _LAT_MAX = 52.3, 52.7
_LNG_MIN, _LNG_MAX = 13.0, 13.8

# (output key, CSV column) pairs, in the order they appear in each record.
_FIELD_COLUMNS = (
    ("object_id", "OBJECTID"),
    ("tree_id", "Baum ID"),
    ("location_number", "Standort Nr"),
    ("identifier", "Kennzeich"),
    ("object_name", "Objektname"),
    ("species_german", "Art"),
    ("species_botanical", "Art Botanisch"),
    ("genus_german", "Gattung"),
    ("genus_botanical", "Gattung Botanisch"),
    ("planting_year", "Pflanzjahr"),
    ("age", "Standalter"),
    ("crown_diameter_m", "Krone Durchschnitt (m)"),
    ("trunk_circumference_cm", "Stammumfang (cm)"),
    ("height_m", "Höhe (m)"),
    ("district", "Bezirk"),
    ("owner", "Eigentümer"),
    ("category", "Kategorie"),
    ("street", "Straße"),
    ("house_number", "Haus Nr"),
    ("address_addition", "Adresszusatz"),
)

# Output keys that are coerced to a specific numeric type.
_INT_FIELDS = frozenset({"planting_year", "age", "trunk_circumference_cm"})
_FLOAT_FIELDS = frozenset({"crown_diameter_m", "height_m"})


def _web_mercator_to_wgs84(x, y):
    """Convert Web Mercator (EPSG:3857) metres to WGS84 ``(lat, lng)`` degrees."""
    lng = math.degrees(x / _EARTH_RADIUS_M)
    # Inverse spherical Mercator: latitude = atan(sinh(y / R)).
    lat = math.degrees(math.atan(math.sinh(y / _EARTH_RADIUS_M)))
    return lat, lng


def _clean_value(key, value):
    """Normalise one record value so it is JSON-serialisable.

    Missing values (``None``/NaN) become ``None``; the integer and float
    fields are coerced (``None`` when the value cannot be converted); strings
    are stripped; numpy scalars are unwrapped to native Python values.
    """
    if pd.isna(value):
        return None
    if key in _INT_FIELDS:
        try:
            return int(float(value))
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers infinite values, which cannot become int.
            return None
    if key in _FLOAT_FIELDS:
        try:
            return float(value)
        except (ValueError, TypeError):
            return None
    if isinstance(value, str):
        return value.strip()
    if hasattr(value, "item"):
        # numpy scalars (e.g. int64) are rejected by json.dump.
        return value.item()
    return value


def _build_tree_record(row, tree_number):
    """Build the output record for one CSV row.

    Args:
        row: Mapping-like row (anything with ``.get``), e.g. a pandas Series.
        tree_number: 1-based sequence number used for the record's ``id``.

    Returns:
        The cleaned record dict, or ``None`` when the row has missing or
        out-of-range coordinates and must be skipped.
    """
    x_coord = row.get('x')
    y_coord = row.get('y')

    # Skip rows with missing coordinates
    if pd.isna(x_coord) or pd.isna(y_coord):
        return None

    # Basic coordinate validation (Berlin area check)
    if not (_X_MIN <= x_coord <= _X_MAX and _Y_MIN <= y_coord <= _Y_MAX):
        return None

    lat, lng = _web_mercator_to_wgs84(x_coord, y_coord)

    # Validate converted coordinates
    if not (_LAT_MIN <= lat <= _LAT_MAX and _LNG_MIN <= lng <= _LNG_MAX):
        return None

    record = {"id": f"tree_{tree_number}"}
    for key, column in _FIELD_COLUMNS:
        record[key] = row.get(column)
    record["lat"] = round(lat, 6)
    record["lng"] = round(lng, 6)
    record["x_coord"] = x_coord
    record["y_coord"] = y_coord

    return {key: _clean_value(key, value) for key, value in record.items()}


def process_street_trees(raw_file=None, processed_file=None):
    """Process the street trees CSV file and create a JSON file.

    Args:
        raw_file: Path of the input CSV. Defaults to the Baumkataster export
            under ``app/data/raw`` (relative to the working directory).
        processed_file: Path of the JSON file to write. Defaults to
            ``app/data/processed/street_trees.json``. Its parent directory is
            created if needed, but only once there is data to write.

    Returns:
        True when the JSON file was written, False when the input file is
        missing or any error occurred while reading, converting or writing.
    """
    raw_file = _DEFAULT_RAW_FILE if raw_file is None else Path(raw_file)
    processed_file = (
        _DEFAULT_PROCESSED_FILE if processed_file is None else Path(processed_file)
    )

    print(f"Reading street trees data from: {raw_file}")

    if not raw_file.exists():
        print(f"Error: Raw file not found at {raw_file}")
        return False

    try:
        # Read the CSV file
        df = pd.read_csv(raw_file, encoding='utf-8')
        print(f"Loaded {len(df)} street trees from CSV")

        # Display column names for debugging
        print("Columns in CSV:", df.columns.tolist())

        trees = []
        skipped_count = 0

        for idx, row in df.iterrows():
            # Best effort: one malformed row must not abort the whole run.
            try:
                tree_data = _build_tree_record(row, len(trees) + 1)
            except Exception as e:
                print(f"Error processing row {idx}: {e}")
                skipped_count += 1
                continue

            if tree_data is None:
                skipped_count += 1
                continue

            trees.append(tree_data)

            # Progress indicator
            if len(trees) % 10000 == 0:
                print(f"Processed {len(trees)} trees...")

        processed_count = len(trees)

        # Create the final data structure
        output_data = {
            "street_trees": trees,
            "count": len(trees),
            "processed_count": processed_count,
            "skipped_count": skipped_count,
            "last_updated": datetime.now().isoformat(),
            "source": "baumkataster_csv",
            "coordinate_system": "EPSG:3857_converted_to_WGS84",
            "note": (
                "Coordinates converted from Web Mercator (EPSG:3857) to WGS84 "
                "using the inverse spherical Mercator projection."
            ),
        }

        # Create the output directory only now, so a failed run leaves no
        # empty directories behind.
        processed_file.parent.mkdir(parents=True, exist_ok=True)

        # Write to JSON file
        print(f"Writing {len(trees)} trees to: {processed_file}")
        with open(processed_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print("Successfully processed street trees data:")
        print(f" - Total rows in CSV: {len(df)}")
        print(f" - Successfully processed: {processed_count}")
        print(f" - Skipped (invalid data): {skipped_count}")
        print(f" - Output file: {processed_file}")

        # Display some sample data
        if trees:
            print("\nSample tree data:")
            for key, value in trees[0].items():
                print(f" {key}: {value}")

        return True

    except Exception as e:
        # Top-level boundary of the script: report and signal failure.
        print(f"Error processing street trees data: {e}")
        return False


if __name__ == "__main__":
    success = process_street_trees()
    if success:
        print("\nStreet trees processing completed successfully!")
    else:
        print("\nStreet trees processing failed!")
        sys.exit(1)