promato_analyse/import_data.py
2025-04-18 22:34:10 +02:00

65 lines
1.7 KiB
Python

import duckdb
import pandas as pd
import os
import datetime
# Import one semicolon-delimited CSV export into the `working_times` table of
# a local DuckDB database, tagging every imported row with the import time.

# Path to the DuckDB database file
db_file = 'working_times.db'
# Path to the CSV file
csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'

# Validate the input BEFORE opening the database, so a missing CSV neither
# creates an empty database file nor leaves a connection open.
if not os.path.exists(csv_file):
    print(f"Error: File {csv_file} not found")
    # SystemExit rather than exit(): exit() is only injected by the `site`
    # module and is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(1)

print(f"Importing data from {csv_file}...")

# Current timestamp for the import (naive local time, as returned by now()).
import_timestamp = datetime.datetime.now()
print(f"Import timestamp: {import_timestamp}")

# Double any single quote so the path remains a valid SQL string literal.
csv_literal = csv_file.replace("'", "''")

# The context manager closes the connection even if a statement fails.
with duckdb.connect(db_file) as con:
    # Stage the raw CSV data. OR REPLACE (instead of IF NOT EXISTS) guarantees
    # that a staging table left behind by an earlier failed run is never
    # silently reused in place of the current file.
    # NOTE: ignore_errors=true makes DuckDB skip rows it cannot parse, so the
    # imported row count can be lower than the number of lines in the CSV.
    con.execute(f"""
        CREATE OR REPLACE TABLE temp_working_times AS
        SELECT * FROM read_csv_auto(
            '{csv_literal}',
            delim=';',
            header=true,
            ignore_errors=true,
            sample_size=1000,
            auto_detect=true,
            decimal_separator=','
        )
    """)

    # Build the final table with the timestamp column. A single
    # CREATE OR REPLACE swaps the table in one statement, so a failure here
    # does not leave the database without a `working_times` table.
    # NOTE(review): the timestamp is stored as a string literal, so the column
    # is text rather than TIMESTAMP; kept as-is for existing consumers.
    con.execute(f"""
        CREATE OR REPLACE TABLE working_times AS
        SELECT
            *,
            '{import_timestamp}' AS import_timestamp
        FROM temp_working_times
    """)

    # Drop the staging table; it is only needed during the import.
    con.execute("DROP TABLE IF EXISTS temp_working_times")

    # Verify the data was imported
    count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
    print(f"Successfully imported {count} records into the working_times table.")

    # Show the table schema (first two DESCRIBE columns: name and type)
    print("\nTable Schema:")
    schema = con.execute("DESCRIBE working_times").fetchall()
    for col in schema:
        print(f"{col[0]}: {col[1]}")

print(f"\nData import complete. Database saved to {db_file}")