dataframe for encoding

This commit is contained in:
lasse 2025-04-19 00:33:02 +02:00
parent cc6ff9325d
commit a048b724ea

View File

@@ -20,22 +20,18 @@ print(f"Importing data from {csv_file}...")
import_timestamp = datetime.datetime.now() import_timestamp = datetime.datetime.now()
print(f"Import timestamp: {import_timestamp}") print(f"Import timestamp: {import_timestamp}")
# First read the CSV file with pandas to handle encoding
print("Reading CSV file with proper encoding...")
df = pd.read_csv(csv_file, sep=';', encoding='ISO-8859-1', decimal=',')
# Create temporary table with CSV data, import timestamp, and hash # Create temporary table with CSV data, import timestamp, and hash
con.execute(""" con.execute("""
CREATE TEMP TABLE IF NOT EXISTS temp_working_times AS CREATE TEMP TABLE temp_working_times AS
WITH base_data AS ( WITH base_data AS (
SELECT SELECT
*, *,
'{timestamp}'::TIMESTAMP AS import_timestamp '{timestamp}'::TIMESTAMP AS import_timestamp
FROM read_csv_auto( FROM df
'{csv_file}',
delim=';',
header=true,
ignore_errors=true,
sample_size=1000,
auto_detect=true,
decimal_separator=','
)
) )
SELECT SELECT
*, *,
@@ -65,7 +61,7 @@ con.execute("""
"Kommentar" "Kommentar"
) AS row_hash ) AS row_hash
FROM base_data FROM base_data
""".format(csv_file=csv_file, timestamp=import_timestamp)) """.format(timestamp=import_timestamp))
# Create the working_times table if it doesn't exist # Create the working_times table if it doesn't exist
con.execute(""" con.execute("""
@@ -85,7 +81,7 @@ con.execute("""
SELECT DISTINCT "Datum" FROM temp_working_times SELECT DISTINCT "Datum" FROM temp_working_times
""") """)
# Get all unique dates from existing data, ignore deleted entries # Get all unique dates from existing data
con.execute(""" con.execute("""
CREATE TEMP TABLE temp_existing_dates AS CREATE TEMP TABLE temp_existing_dates AS
SELECT DISTINCT "Datum" FROM working_times SELECT DISTINCT "Datum" FROM working_times
@@ -107,16 +103,17 @@ con.execute("""
FROM temp_working_times t FROM temp_working_times t
JOIN working_times e ON t."Datum" = e."Datum" JOIN working_times e ON t."Datum" = e."Datum"
WHERE NOT EXISTS ( WHERE NOT EXISTS (
SELECT 1 FROM working_times e2 SELECT 1
WHERE e2."Datum" = t."Datum" FROM working_times
AND e2.row_hash = t.row_hash WHERE "Datum" = t."Datum"
AND row_hash = t.row_hash
) )
""") """)
# Insert new data for new and changed dates # Insert new data for new and changed dates
con.execute(""" con.execute("""
INSERT INTO working_times INSERT INTO working_times
SELECT * FROM temp_working_times SELECT *, NULL as delete FROM temp_working_times
WHERE "Datum" IN ( WHERE "Datum" IN (
SELECT "Datum" FROM temp_new_dates SELECT "Datum" FROM temp_new_dates
UNION UNION
@@ -124,9 +121,6 @@ con.execute("""
) )
""") """)
# Set delete flag
# TODO
# Verify the data was imported # Verify the data was imported
count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0] count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
print(f"Successfully imported data. Total records in database: {count}") print(f"Successfully imported data. Total records in database: {count}")