promato_analyse/transform_data.py
2025-04-18 22:34:10 +02:00

118 lines
4.2 KiB
Python

import duckdb
import pandas as pd
import os
import time
import datetime
# Connect to the database, retrying with a linearly growing delay
# (1s, 2s, 3s, ...) between attempts, e.g. while another process still
# holds a lock on the database file.
max_retries = 5
retry_count = 0
connected = False
print("Trying to connect to working_times.db...")
while not connected and retry_count < max_retries:
    try:
        # The connection has to be read-write: the summary tables are
        # (re)created further down, so read-only access mode is not an option.
        con = duckdb.connect('working_times.db')
        connected = True
        print("Connected to working_times.db")
    except Exception as e:  # script-level boundary: report, then retry or exit
        retry_count += 1
        print(f"Connection attempt {retry_count} failed: {e}")
        if retry_count < max_retries:
            print(f"Retrying in {retry_count} seconds...")
            time.sleep(retry_count)
        else:
            print("Maximum retries reached. Exiting.")
            # Raise SystemExit directly instead of calling exit(): the latter
            # is injected by the `site` module and is not guaranteed to exist
            # (e.g. `python -S` or frozen executables).
            raise SystemExit(1)
print("Transforming data...")
# Take a single timestamp up front so that every table stamped with
# transform_timestamp in this run carries the same value.
transform_timestamp = datetime.datetime.now()
print(f"Transform timestamp: {transform_timestamp}")
# Build working_times_summary: one row per (date, project, activity type)
# with the summed hours.
#   * total_days divides the hours by 8 -- presumably an 8-hour working day
#     (NOTE(review): confirm).
#   * transform_timestamp is interpolated as a quoted SQL string literal; the
#     value comes from datetime.now() above, never from external input.
# CREATE OR REPLACE swaps the table in one statement, so a failing SELECT
# cannot leave the database without the previous summary table (a separate
# DROP followed by a failed CREATE would).
con.execute("""
    CREATE OR REPLACE TABLE working_times_summary AS
    SELECT
        Datum AS date,
        Projektname AS project_name,
        "Leistungsart (Bezeichnung)" AS activity_type,
        SUM("Zeit [h]") AS total_hours,
        SUM("Zeit [h]"/8) AS total_days,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY date, project_name, activity_type
    ORDER BY date, project_name, activity_type;
""".format(transform_timestamp=transform_timestamp))
# Build project_summary: one row per project with total hours, total days
# (hours / 8 -- presumably an 8-hour working day, NOTE(review): confirm),
# the number of distinct dates with entries, and the newest import_timestamp
# of the contributing source rows. Rows are inserted by descending hours.
# CREATE OR REPLACE swaps the table in one statement, so a failing SELECT
# cannot leave the database without the previous project_summary table.
con.execute("""
    CREATE OR REPLACE TABLE project_summary AS
    SELECT
        Projektname AS project_name,
        SUM("Zeit [h]") AS total_hours,
        SUM("Zeit [h]"/8) AS total_days,
        COUNT(DISTINCT Datum) AS days_worked,
        MAX(import_timestamp) AS source_import_timestamp,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY project_name
    ORDER BY total_hours DESC;
""".format(transform_timestamp=transform_timestamp))
# Build daily_summary: one row per date with the summed hours, the number of
# source entries, the number of distinct projects, and the newest
# import_timestamp of the contributing source rows.
# CREATE OR REPLACE swaps the table in one statement, so a failing SELECT
# cannot leave the database without the previous daily_summary table.
con.execute("""
    CREATE OR REPLACE TABLE daily_summary AS
    SELECT
        Datum AS date,
        SUM("Zeit [h]") AS total_hours,
        COUNT(*) AS entry_count,
        COUNT(DISTINCT Projektname) AS project_count,
        MAX(import_timestamp) AS source_import_timestamp,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY date
    ORDER BY date;
""".format(transform_timestamp=transform_timestamp))
# Sanity check: read back the row count of each freshly built table.
# Each COUNT(*) query returns exactly one single-column row, which is
# unpacked directly into the counter.
(summary_count,) = con.execute("SELECT COUNT(*) FROM working_times_summary").fetchone()
(project_count,) = con.execute("SELECT COUNT(*) FROM project_summary").fetchone()
(daily_count,) = con.execute("SELECT COUNT(*) FROM daily_summary").fetchone()
print(f"Successfully created {summary_count} records in working_times_summary table.")
print(f"Successfully created {project_count} records in project_summary table.")
print(f"Successfully created {daily_count} records in daily_summary table.")
# Show up to five rows of working_times_summary as a quick visual check.
print("\nSample of working_times_summary table:")
summary_sample = con.execute(
    "SELECT date, project_name, activity_type, total_hours, total_days, transform_timestamp "
    "FROM working_times_summary LIMIT 5"
).fetchall()
for sample_row in summary_sample:
    print(sample_row)
# Print the five projects with the most hours.
# The explicit ORDER BY makes the "top 5 by hours" label hold regardless of
# the physical row order of project_summary; a bare LIMIT only returns the
# top rows if the table happens to be scanned in insertion order.
print("\nProject summary (top 5 by hours):")
project_sample = con.execute(
    "SELECT project_name, total_hours, total_days, days_worked, transform_timestamp "
    "FROM project_summary ORDER BY total_hours DESC LIMIT 5"
).fetchall()
for row in project_sample:
    print(row)
# Report the overall totals across all projects, then release the database
# connection. The try/finally guarantees the connection is closed even if
# the totals query or the formatting fails.
try:
    total_hours, total_days = con.execute(
        "SELECT SUM(total_hours), SUM(total_days) FROM project_summary"
    ).fetchone()
    # SUM over an empty table yields NULL (None in Python), which the
    # ":.2f" format below cannot render -- report zero instead of crashing.
    if total_hours is None:
        total_hours = 0.0
    if total_days is None:
        total_days = 0.0
    print(f"\nTotal hours worked: {total_hours:.2f}")
    print(f"Total days worked: {total_days:.2f}")
finally:
    # Close the connection
    con.close()
print("\nData transformation complete.")