137 lines
3.8 KiB
Python
137 lines
3.8 KiB
Python
import duckdb
|
|
import pandas as pd
|
|
import os
|
|
import datetime
|
|
|
|
# Create connection to DuckDB
|
|
con = duckdb.connect('working_times.db')
|
|
|
|
# Path to the CSV file
|
|
csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'
|
|
|
|
# Check if file exists
|
|
if not os.path.exists(csv_file):
|
|
print(f"Error: File {csv_file} not found")
|
|
exit(1)
|
|
|
|
print(f"Importing data from {csv_file}...")
|
|
|
|
# Current timestamp for the import
|
|
import_timestamp = datetime.datetime.now()
|
|
print(f"Import timestamp: {import_timestamp}")
|
|
|
|
# First read the CSV file with pandas to handle encoding
|
|
print("Reading CSV file with proper encoding...")
|
|
df = pd.read_csv(csv_file, sep=';', encoding='ISO-8859-1', decimal=',')
|
|
|
|
# Create temporary table with CSV data, import timestamp, and hash
|
|
con.execute("""
|
|
CREATE TEMP TABLE temp_working_times AS
|
|
WITH base_data AS (
|
|
SELECT
|
|
*,
|
|
'{timestamp}'::TIMESTAMP AS import_timestamp
|
|
FROM df
|
|
)
|
|
SELECT
|
|
*,
|
|
md5(
|
|
"Datum" || '|' ||
|
|
"Von" || '|' ||
|
|
"Bis" || '|' ||
|
|
"Pause" || '|' ||
|
|
"Gebucht von" || '|' ||
|
|
"Gebucht am" || '|' ||
|
|
"Name" || '|' ||
|
|
"Kennung" || '|' ||
|
|
"Projekt-PSP-ID" || '|' ||
|
|
"Projektname" || '|' ||
|
|
"PSP-ID" || '|' ||
|
|
"PSP-Name" || '|' ||
|
|
"ProMaTo-Projekt-Nr." || '|' ||
|
|
"ProMaTo-Projekt" || '|' ||
|
|
"Kategorie" || '|' ||
|
|
"Gruppenname" || '|' ||
|
|
"Jira-Epic" || '|' ||
|
|
"Aufgaben-Nr." || '|' ||
|
|
"Aufgabenname" || '|' ||
|
|
"Leistungsart" || '|' ||
|
|
"Leistungsart (Bezeichnung)" || '|' ||
|
|
"Zeit [h]" || '|' ||
|
|
"Kommentar"
|
|
) AS row_hash
|
|
FROM base_data
|
|
""".format(timestamp=import_timestamp))
|
|
|
|
# Create the working_times table if it doesn't exist
|
|
con.execute("""
|
|
CREATE TABLE IF NOT EXISTS working_times AS
|
|
SELECT * FROM temp_working_times LIMIT 0
|
|
""")
|
|
|
|
# Add column for delete flag if it doesn't exist
|
|
con.execute("""
|
|
ALTER TABLE working_times
|
|
ADD COLUMN IF NOT EXISTS delete BOOLEAN DEFAULT NULL
|
|
""")
|
|
|
|
# Get all unique dates from the new data
|
|
con.execute("""
|
|
CREATE TEMP TABLE temp_import_dates AS
|
|
SELECT DISTINCT "Datum" FROM temp_working_times
|
|
""")
|
|
|
|
# Get all unique dates from existing data
|
|
con.execute("""
|
|
CREATE TEMP TABLE temp_existing_dates AS
|
|
SELECT DISTINCT "Datum" FROM working_times
|
|
""")
|
|
|
|
# Find dates that are new (not in existing data)
|
|
con.execute("""
|
|
CREATE TEMP TABLE temp_new_dates AS
|
|
SELECT t."Datum"
|
|
FROM temp_import_dates t
|
|
LEFT JOIN temp_existing_dates e ON t."Datum" = e."Datum"
|
|
WHERE e."Datum" IS NULL
|
|
""")
|
|
|
|
# Find dates that exist in both tables and might have changes
|
|
con.execute("""
|
|
CREATE TEMP TABLE temp_changed_dates AS
|
|
SELECT DISTINCT t."Datum"
|
|
FROM temp_working_times t
|
|
JOIN working_times e ON t."Datum" = e."Datum"
|
|
WHERE NOT EXISTS (
|
|
SELECT 1
|
|
FROM working_times
|
|
WHERE "Datum" = t."Datum"
|
|
AND row_hash = t.row_hash
|
|
)
|
|
""")
|
|
|
|
# Insert new data for new and changed dates
|
|
con.execute("""
|
|
INSERT INTO working_times
|
|
SELECT *, NULL as delete FROM temp_working_times
|
|
WHERE "Datum" IN (
|
|
SELECT "Datum" FROM temp_new_dates
|
|
UNION
|
|
SELECT "Datum" FROM temp_changed_dates
|
|
)
|
|
""")
|
|
|
|
# Verify the data was imported
|
|
count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
|
|
print(f"Successfully imported data. Total records in database: {count}")
|
|
|
|
# Show the table schema
|
|
print("\nTable Schema:")
|
|
schema = con.execute("DESCRIBE working_times").fetchall()
|
|
for col in schema:
|
|
print(f"{col[0]}: {col[1]}")
|
|
|
|
# Close the connection
|
|
con.close()
|
|
|
|
print("\nData import complete. Database saved to working_times.db") |