promato_analyse/import_data.py

import duckdb
import pandas as pd
import os
import datetime

# Create connection to DuckDB
con = duckdb.connect('working_times.db')

# Path to the CSV file
csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'

# Check if file exists
if not os.path.exists(csv_file):
    print(f"Error: File {csv_file} not found")
    exit(1)

print(f"Importing data from {csv_file}...")

# Current timestamp for the import
import_timestamp = datetime.datetime.now()
print(f"Import timestamp: {import_timestamp}")

# First read the CSV file with pandas to handle encoding
print("Reading CSV file with proper encoding...")
df = pd.read_csv(csv_file, sep=';', encoding='ISO-8859-1', decimal=',')

# Create temporary table with CSV data, import timestamp, and hash
con.execute("""
    CREATE TEMP TABLE temp_working_times AS
    WITH base_data AS (
        SELECT
            *,
            '{timestamp}'::TIMESTAMP AS import_timestamp
        FROM df
    )
    SELECT
        *,
        md5(
            "Datum" || '|' ||
            "Von" || '|' ||
            "Bis" || '|' ||
            "Pause" || '|' ||
            "Gebucht von" || '|' ||
            "Gebucht am" || '|' ||
            "Name" || '|' ||
            "Kennung" || '|' ||
            "Projekt-PSP-ID" || '|' ||
            "Projektname" || '|' ||
            "PSP-ID" || '|' ||
            "PSP-Name" || '|' ||
            "ProMaTo-Projekt-Nr." || '|' ||
            "ProMaTo-Projekt" || '|' ||
            "Kategorie" || '|' ||
            "Gruppenname" || '|' ||
            "Jira-Epic" || '|' ||
            "Aufgaben-Nr." || '|' ||
            "Aufgabenname" || '|' ||
            "Leistungsart" || '|' ||
            "Leistungsart (Bezeichnung)" || '|' ||
            "Zeit [h]" || '|' ||
            "Kommentar"
        ) AS row_hash
    FROM base_data
""".format(timestamp=import_timestamp))

# Create the working_times table if it doesn't exist
con.execute("""
    CREATE TABLE IF NOT EXISTS working_times AS
    SELECT * FROM temp_working_times LIMIT 0
""")

# Add column for delete flag if it doesn't exist
con.execute("""
    ALTER TABLE working_times
    ADD COLUMN IF NOT EXISTS delete BOOLEAN DEFAULT NULL
""")

# Get all unique dates from the new data
con.execute("""
    CREATE TEMP TABLE temp_import_dates AS
    SELECT DISTINCT "Datum" FROM temp_working_times
""")

# Get all unique dates from existing data
con.execute("""
    CREATE TEMP TABLE temp_existing_dates AS
    SELECT DISTINCT "Datum" FROM working_times
""")

# Find dates that are new (not in existing data)
con.execute("""
    CREATE TEMP TABLE temp_new_dates AS
    SELECT t."Datum"
    FROM temp_import_dates t
    LEFT JOIN temp_existing_dates e ON t."Datum" = e."Datum"
    WHERE e."Datum" IS NULL
""")

# Find dates that exist in both tables and might have changes
con.execute("""
    CREATE TEMP TABLE temp_changed_dates AS
    SELECT DISTINCT t."Datum"
    FROM temp_working_times t
    JOIN working_times e ON t."Datum" = e."Datum"
    WHERE NOT EXISTS (
        SELECT 1
        FROM working_times
        WHERE "Datum" = t."Datum"
        AND row_hash = t.row_hash
    )
""")

# Insert new data for new and changed dates
con.execute("""
    INSERT INTO working_times
    SELECT *, NULL as delete FROM temp_working_times
    WHERE "Datum" IN (
        SELECT "Datum" FROM temp_new_dates
        UNION
        SELECT "Datum" FROM temp_changed_dates
    )
""")

# Verify the data was imported
count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
print(f"Successfully imported data. Total records in database: {count}")

# Show the table schema
print("\nTable Schema:")
schema = con.execute("DESCRIBE working_times").fetchall()
for col in schema:
    print(f"{col[0]}: {col[1]}")

# Close the connection
con.close()

print("\nData import complete. Database saved to working_times.db")