import duckdb import pandas as pd import os import datetime # Create connection to DuckDB con = duckdb.connect('working_times.db') # Path to the CSV file csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv' # Check if file exists if not os.path.exists(csv_file): print(f"Error: File {csv_file} not found") exit(1) print(f"Importing data from {csv_file}...") # Current timestamp for the import import_timestamp = datetime.datetime.now() print(f"Import timestamp: {import_timestamp}") # First read the CSV file with pandas to handle encoding print("Reading CSV file with proper encoding...") df = pd.read_csv(csv_file, sep=';', encoding='ISO-8859-1', decimal=',') # Create temporary table with CSV data, import timestamp, and hash con.execute(""" CREATE TEMP TABLE temp_working_times AS WITH base_data AS ( SELECT *, '{timestamp}'::TIMESTAMP AS import_timestamp FROM df ) SELECT *, md5( "Datum" || '|' || "Von" || '|' || "Bis" || '|' || "Pause" || '|' || "Gebucht von" || '|' || "Gebucht am" || '|' || "Name" || '|' || "Kennung" || '|' || "Projekt-PSP-ID" || '|' || "Projektname" || '|' || "PSP-ID" || '|' || "PSP-Name" || '|' || "ProMaTo-Projekt-Nr." || '|' || "ProMaTo-Projekt" || '|' || "Kategorie" || '|' || "Gruppenname" || '|' || "Jira-Epic" || '|' || "Aufgaben-Nr." || '|' || "Aufgabenname" || '|' || "Leistungsart" || '|' || "Leistungsart (Bezeichnung)" || '|' || "Zeit [h]" || '|' || "Kommentar" ) AS row_hash FROM base_data """.format(timestamp=import_timestamp)) # Create the working_times table if it doesn't exist con.execute(""" CREATE TABLE IF NOT EXISTS working_times AS SELECT * FROM temp_working_times LIMIT 0 """) # Add column for delete flag if it doesn't exist con.execute(""" ALTER TABLE working_times ADD COLUMN IF NOT EXISTS delete BOOLEAN DEFAULT NULL """) # Get all unique dates from the new data con.execute(""" CREATE TEMP TABLE temp_import_dates AS SELECT DISTINCT "Datum" FROM temp_working_times """) # Get all unique dates from existing data con.execute(""" CREATE TEMP TABLE temp_existing_dates AS SELECT DISTINCT "Datum" FROM working_times """) # Find dates that are new (not in existing data) con.execute(""" CREATE TEMP TABLE temp_new_dates AS SELECT t."Datum" FROM temp_import_dates t LEFT JOIN temp_existing_dates e ON t."Datum" = e."Datum" WHERE e."Datum" IS NULL """) # Find dates that exist in both tables and might have changes con.execute(""" CREATE TEMP TABLE temp_changed_dates AS SELECT DISTINCT t."Datum" FROM temp_working_times t JOIN working_times e ON t."Datum" = e."Datum" WHERE NOT EXISTS ( SELECT 1 FROM working_times WHERE "Datum" = t."Datum" AND row_hash = t.row_hash ) """) # Insert new data for new and changed dates con.execute(""" INSERT INTO working_times SELECT *, NULL as delete FROM temp_working_times WHERE "Datum" IN ( SELECT "Datum" FROM temp_new_dates UNION SELECT "Datum" FROM temp_changed_dates ) """) # Verify the data was imported count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0] print(f"Successfully imported data. Total records in database: {count}") # Show the table schema print("\nTable Schema:") schema = con.execute("DESCRIBE working_times").fetchall() for col in schema: print(f"{col[0]}: {col[1]}") # Close the connection con.close() print("\nData import complete. Database saved to working_times.db")