Read CSV into a pandas DataFrame to handle file encoding

This commit is contained in:
lasse 2025-04-19 00:33:02 +02:00
parent cc6ff9325d
commit a048b724ea

View File

@ -20,22 +20,18 @@ print(f"Importing data from {csv_file}...")
import_timestamp = datetime.datetime.now()
print(f"Import timestamp: {import_timestamp}")
# First read the CSV file with pandas to handle encoding
print("Reading CSV file with proper encoding...")
df = pd.read_csv(csv_file, sep=';', encoding='ISO-8859-1', decimal=',')
# Create temporary table with CSV data, import timestamp, and hash
con.execute("""
CREATE TEMP TABLE IF NOT EXISTS temp_working_times AS
CREATE TEMP TABLE temp_working_times AS
WITH base_data AS (
SELECT
*,
'{timestamp}'::TIMESTAMP AS import_timestamp
FROM read_csv_auto(
'{csv_file}',
delim=';',
header=true,
ignore_errors=true,
sample_size=1000,
auto_detect=true,
decimal_separator=','
)
FROM df
)
SELECT
*,
@ -65,7 +61,7 @@ con.execute("""
"Kommentar"
) AS row_hash
FROM base_data
""".format(csv_file=csv_file, timestamp=import_timestamp))
""".format(timestamp=import_timestamp))
# Create the working_times table if it doesn't exist
con.execute("""
@ -85,7 +81,7 @@ con.execute("""
SELECT DISTINCT "Datum" FROM temp_working_times
""")
# Get all unique dates from existing data, ignore deleted entries
# Get all unique dates from existing data
con.execute("""
CREATE TEMP TABLE temp_existing_dates AS
SELECT DISTINCT "Datum" FROM working_times
@ -107,16 +103,17 @@ con.execute("""
FROM temp_working_times t
JOIN working_times e ON t."Datum" = e."Datum"
WHERE NOT EXISTS (
SELECT 1 FROM working_times e2
WHERE e2."Datum" = t."Datum"
AND e2.row_hash = t.row_hash
SELECT 1
FROM working_times
WHERE "Datum" = t."Datum"
AND row_hash = t.row_hash
)
""")
# Insert new data for new and changed dates
con.execute("""
INSERT INTO working_times
SELECT * FROM temp_working_times
SELECT *, NULL as delete FROM temp_working_times
WHERE "Datum" IN (
SELECT "Datum" FROM temp_new_dates
UNION
@ -124,9 +121,6 @@ con.execute("""
)
""")
# Set delete flag
# TODO
# Verify the data was imported
count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
print(f"Successfully imported data. Total records in database: {count}")