# promato_analyse/import_data.py

import duckdb
import pandas as pd
import os
import datetime

# Import a semicolon-delimited working-times CSV export into the DuckDB
# database file `working_times.db`: the `working_times` table is replaced with
# the CSV contents plus an `import_timestamp` column holding the time of this
# run, then the row count and resulting schema are printed.

# Path to the CSV file
csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'

# Validate the input BEFORE opening the database, so a missing file neither
# creates an empty working_times.db nor leaves a connection unclosed.
if not os.path.exists(csv_file):
    print(f"Error: File {csv_file} not found")
    # `exit()` is a helper injected by the `site` module and is not guaranteed
    # to exist (e.g. `python -S`); raising SystemExit always works.
    raise SystemExit(1)

print(f"Importing data from {csv_file}...")

# Current timestamp for the import
import_timestamp = datetime.datetime.now()
print(f"Import timestamp: {import_timestamp}")

# Create connection to DuckDB
con = duckdb.connect('working_times.db')
try:
    # Double any single quote so the path always remains a single SQL string
    # literal, whatever characters the file name contains.
    csv_literal = csv_file.replace("'", "''")

    # Stage the CSV data. OR REPLACE (rather than IF NOT EXISTS) guarantees a
    # staging table left behind by an earlier failed run is never reused in
    # place of the current file's contents.
    # NOTE: with ignore_errors=true, rows DuckDB cannot parse are skipped
    # instead of aborting the import, so the record count printed below may be
    # lower than the number of data lines in the file.
    con.execute(f"""
        CREATE OR REPLACE TABLE temp_working_times AS
        SELECT * FROM read_csv_auto(
            '{csv_literal}',
            delim=';',
            header=true,
            ignore_errors=true,
            sample_size=1000,
            auto_detect=true,
            decimal_separator=','
        )
    """)

    # Build the final table with the timestamp column. CREATE OR REPLACE swaps
    # the table in one statement, so a failure here does not leave the database
    # without a working_times table (as a separate DROP + CREATE would).
    # NOTE(review): the timestamp is a quoted string literal, so the column is
    # stored as text; cast to TIMESTAMP if downstream queries expect that type.
    con.execute(f"""
        CREATE OR REPLACE TABLE working_times AS
        SELECT
            *,
            '{import_timestamp}' AS import_timestamp
        FROM temp_working_times
    """)

    # Drop the staging table
    con.execute("DROP TABLE IF EXISTS temp_working_times")

    # Verify the data was imported
    count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
    print(f"Successfully imported {count} records into the working_times table.")

    # Show the table schema (column name and column type per row)
    print("\nTable Schema:")
    schema = con.execute("DESCRIBE working_times").fetchall()
    for col in schema:
        print(f"{col[0]}: {col[1]}")
finally:
    # Close the connection even when one of the statements above raised.
    con.close()

print("\nData import complete. Database saved to working_times.db")