commit bee6508f4c5bcc06281d878425e2428401f87a2a Author: lasse Date: Fri Apr 18 22:34:10 2025 +0200 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..264a49e --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv/ +data/* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e15bae2 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +# Working Time Analysis + +This project analyzes working time data from a CSV file by: +1. Importing the data into a DuckDB database +2. Transforming the data for analysis +3. Generating reports based on the data + +## Setup + +### Dependencies + +Install the required dependencies: + +```bash +pip install -r requirements.txt +``` + +## Usage + +### 1. Import Data + +Run the import script to load the CSV data into DuckDB: + +```bash +python3 import_data.py +``` + +This will: +- Create a DuckDB database file `working_times.db` +- Import the CSV data into a table +- Add an import timestamp to each record + +### 2. Transform Data + +Run the transformation script to create analytical views: + +```bash +python3 transform_data.py +``` + +This will: +- Create summary tables with aggregated data +- Convert hours to days (using 8 hours = 1 day conversion) +- Add transformation timestamps + +### 3. Analyze Data + +Run the analysis script to generate reports: + +```bash +python3 analyze_data.py +``` + +This will produce: +- Overall time summary +- Top projects by hours +- Busiest days +- Day distribution analysis +- Project-activity combinations + +## Data Structure + +The analysis uses the following tables: + +- `working_times`: Raw imported data +- `working_times_summary`: Per-day, per-project aggregation +- `project_summary`: Total time per project +- `daily_summary`: Total time per day + +Each derived table includes timestamps for data lineage tracking. 
"""analyze_data.py — generate working-time reports from the DuckDB summaries.

Reads the tables produced by transform_data.py (working_times_summary,
project_summary, daily_summary) in working_times.db and prints a report:
overall totals, top projects, busiest days, day distribution, and the top
project/activity combinations.

Fixes over the original: the whole report ran at module import time and
called exit(1) at top level; it is now wrapped in main() behind a
__main__ guard, and the database connection is closed via try/finally
even when a report query fails.
"""
from datetime import datetime


def format_hours(hours):
    """Format a float hour count as e.g. '7.50h'."""
    return f"{hours:.2f}h"


def format_days(days):
    """Format a float day count as e.g. '0.94d'."""
    return f"{days:.2f}d"


def main():
    """Run every report section against working_times.db and print them."""
    # Imported lazily so the module can be imported without duckdb installed.
    import duckdb

    try:
        con = duckdb.connect('working_times.db')
        print("Connected to working_times.db")
    except Exception as e:
        print(f"Error connecting to database: {e}")
        raise SystemExit(1)

    # Timestamp for this analysis run (printed in header and footer).
    analysis_timestamp = datetime.now()

    try:
        # Date range covered by the data.
        start_date, end_date = con.execute("""
            SELECT MIN(date) AS start_date, MAX(date) AS end_date
            FROM daily_summary
        """).fetchone()

        # Data lineage: most recent transform / import timestamps.
        transform_timestamp, source_import_timestamp = con.execute("""
            SELECT
                MAX(transform_timestamp) AS transform_timestamp,
                MAX(source_import_timestamp) AS source_import_timestamp
            FROM daily_summary
        """).fetchone()

        print("\n" + "=" * 60)
        print(f"WORKING TIME ANALYSIS: {start_date} to {end_date}")
        print(f"ANALYSIS TIMESTAMP: {analysis_timestamp}")
        print(f"DATA TRANSFORMATION: {transform_timestamp}")
        print(f"DATA IMPORT: {source_import_timestamp}")
        print("=" * 60)

        # Overall totals.
        total_hours, total_days = con.execute("""
            SELECT SUM(total_hours) AS total_hours, SUM(total_days) AS total_days
            FROM project_summary
        """).fetchone()
        num_working_days = con.execute(
            "SELECT COUNT(*) FROM daily_summary WHERE total_hours > 0"
        ).fetchone()[0]
        avg_hours_per_day = total_hours / num_working_days if num_working_days > 0 else 0

        print(f"\nTOTAL HOURS: {format_hours(total_hours)}")
        print(f"TOTAL DAYS: {format_days(total_days)}")
        print(f"WORKING DAYS: {num_working_days}")
        print(f"AVG HOURS PER WORKING DAY: {format_hours(avg_hours_per_day)}")

        # Top projects by total hours.
        top_projects = con.execute("""
            SELECT project_name, total_hours, total_days, days_worked
            FROM project_summary
            ORDER BY total_hours DESC
            LIMIT 5
        """).fetchall()

        print("\n" + "-" * 60)
        print("TOP 5 PROJECTS BY HOURS")
        print("-" * 60)
        for i, (project, hours, days, worked_days) in enumerate(top_projects, 1):
            percent = (hours / total_hours) * 100
            # Falls back to bare 0 when a project has no worked days
            # (matches the original output exactly).
            daily_avg = format_hours(hours / worked_days) if worked_days > 0 else 0
            print(f"{i}. {project}")
            print(f"   {format_hours(hours)} ({percent:.1f}% of total) / {format_days(days)}")
            print(f"   Across {worked_days} days, daily average: {daily_avg}")

        # Busiest days.
        busiest_days = con.execute("""
            SELECT date, total_hours, project_count
            FROM daily_summary
            WHERE total_hours > 0
            ORDER BY total_hours DESC
            LIMIT 5
        """).fetchall()

        print("\n" + "-" * 60)
        print("TOP 5 BUSIEST DAYS")
        print("-" * 60)
        for i, (date, hours, project_count) in enumerate(busiest_days, 1):
            # 8 working hours == 1 day (project convention).
            day_equivalent = hours / 8
            print(f"{i}. {date}: {format_hours(hours)} ({format_days(day_equivalent)}) across {project_count} projects")

        # Distribution of working days by hour bucket.
        day_distribution = con.execute("""
            SELECT
                CASE
                    WHEN total_hours <= 4 THEN '0-4 hours'
                    WHEN total_hours <= 6 THEN '4-6 hours'
                    WHEN total_hours <= 8 THEN '6-8 hours'
                    WHEN total_hours <= 10 THEN '8-10 hours'
                    ELSE '10+ hours'
                END AS hour_range,
                COUNT(*) as day_count
            FROM daily_summary
            WHERE total_hours > 0
            GROUP BY hour_range
            ORDER BY
                CASE
                    WHEN hour_range = '0-4 hours' THEN 1
                    WHEN hour_range = '4-6 hours' THEN 2
                    WHEN hour_range = '6-8 hours' THEN 3
                    WHEN hour_range = '8-10 hours' THEN 4
                    ELSE 5
                END
        """).fetchall()

        print("\n" + "-" * 60)
        print("DAY DISTRIBUTION")
        print("-" * 60)
        for hour_range, day_count in day_distribution:
            percent = (day_count / num_working_days) * 100
            print(f"{hour_range}: {day_count} days ({percent:.1f}%)")

        # Top project/activity combinations.
        project_activity_combo = con.execute("""
            SELECT
                project_name,
                activity_type,
                SUM(total_hours) as hours,
                SUM(total_days) as days
            FROM working_times_summary
            GROUP BY project_name, activity_type
            ORDER BY hours DESC
            LIMIT 10
        """).fetchall()

        print("\n" + "-" * 60)
        print("TOP 10 PROJECT-ACTIVITY COMBINATIONS")
        print("-" * 60)
        for project, activity, hours, days in project_activity_combo:
            percent = (hours / total_hours) * 100
            print(f"{project} - {activity}: {format_hours(hours)} ({format_days(days)}, {percent:.1f}%)")

        print("\n" + "=" * 60)
        print(f"END OF ANALYSIS - Generated at {analysis_timestamp}")
        print("=" * 60)
    finally:
        # Always release the database handle, even if a query failed.
        con.close()


if __name__ == "__main__":
    main()
"""import_data.py — import the working-time CSV export into DuckDB.

Loads the semicolon-delimited CSV into the `working_times` table of
working_times.db and stamps every row with the import time for data
lineage.

Fixes over the original: the staging table was created with
CREATE TABLE IF NOT EXISTS, so a stale table from an earlier aborted run
could be silently reused; the CSV path and timestamp were interpolated
into the SQL text via str.format (storing the timestamp as a VARCHAR);
the connection was opened before the file check and never closed on
error; the whole script ran at import time.
"""
import datetime
import os


def main():
    """Import the CSV into working_times.db, replacing any previous table."""
    # Imported lazily so the module can be imported without duckdb installed.
    import duckdb

    # Path to the CSV file exported from the time-tracking tool.
    csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'

    # Check the file before touching the database at all.
    if not os.path.exists(csv_file):
        print(f"Error: File {csv_file} not found")
        raise SystemExit(1)

    print(f"Importing data from {csv_file}...")

    # Current timestamp for the import (stored on every row for lineage).
    import_timestamp = datetime.datetime.now()
    print(f"Import timestamp: {import_timestamp}")

    con = duckdb.connect('working_times.db')
    try:
        # CREATE OR REPLACE (not IF NOT EXISTS) so a stale staging table
        # from an aborted run can never be reused silently. The path is
        # bound as a prepared parameter instead of string-formatted SQL.
        con.execute("""
            CREATE OR REPLACE TABLE temp_working_times AS
            SELECT * FROM read_csv_auto(
                ?,
                delim=';',
                header=true,
                ignore_errors=true,
                sample_size=1000,
                auto_detect=true,
                decimal_separator=','
            )
        """, [csv_file])

        # Drop the existing table if it exists.
        con.execute("DROP TABLE IF EXISTS working_times")

        # Final table with the import timestamp column; binding the
        # datetime as a parameter stores a proper TIMESTAMP value.
        con.execute("""
            CREATE TABLE working_times AS
            SELECT
                *,
                ? AS import_timestamp
            FROM temp_working_times
        """, [import_timestamp])

        # Drop the staging table.
        con.execute("DROP TABLE IF EXISTS temp_working_times")

        # Verify the data was imported.
        count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
        print(f"Successfully imported {count} records into the working_times table.")

        # Show the table schema.
        print("\nTable Schema:")
        for col in con.execute("DESCRIBE working_times").fetchall():
            print(f"{col[0]}: {col[1]}")
    finally:
        # Always release the database handle.
        con.close()

    print("\nData import complete. Database saved to working_times.db")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Working Time Analysis - Complete Workflow

This script runs all three steps of the analysis in sequence:
1. Import the data    (import_data.py)
2. Transform the data (transform_data.py)
3. Generate analysis reports (analyze_data.py)

Dependencies (see requirements.txt): duckdb==1.2.2, pandas>=2.2.0,
python-dateutil>=2.8.2, pytz>=2025.1, numpy>=1.22.4.
"""

import os
import subprocess
import sys
import time


def run_step(script_name, step_desc):
    """Run one analysis step as a subprocess and report its outcome.

    Args:
        script_name: path of the Python script to execute.
        step_desc: human-readable banner text for the step.

    Returns:
        True when the script exits 0, False otherwise. The script's
        output is echoed either way.
    """
    print(f"\n{'='*60}")
    print(f"STEP: {step_desc}")
    print(f"{'='*60}")

    try:
        # check=True raises CalledProcessError on a non-zero exit code.
        result = subprocess.run(
            [sys.executable, script_name],
            capture_output=True,
            text=True,
            check=True,
        )
        print(result.stdout)
        return True
    except subprocess.CalledProcessError as e:
        print(f"ERROR in {script_name}:")
        # BUG FIX: also surface stdout on failure — the failing script's
        # progress messages were previously discarded with only stderr shown.
        if e.stdout:
            print(e.stdout)
        print(e.stderr)
        return False


def main():
    """Run import -> transform -> analyze; return True on full success."""
    # Fail fast if the source CSV is missing.
    csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'
    if not os.path.exists(csv_file):
        print(f"Error: CSV file not found at {csv_file}")
        return False

    # Step 1: Import Data
    if not run_step('import_data.py', 'IMPORTING DATA'):
        return False
    # Wait a moment to ensure any file locks are released.
    time.sleep(1)

    # Step 2: Transform Data
    if not run_step('transform_data.py', 'TRANSFORMING DATA'):
        return False
    time.sleep(1)

    # Step 3: Analyze Data
    if not run_step('analyze_data.py', 'ANALYZING DATA'):
        return False

    print("\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)
    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
main() + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/transform_data.py b/transform_data.py new file mode 100644 index 0000000..7883204 --- /dev/null +++ b/transform_data.py @@ -0,0 +1,118 @@ +import duckdb +import pandas as pd +import os +import time +import datetime + +# Try to connect to the database with retry logic +max_retries = 5 +retry_count = 0 +connected = False + +print("Trying to connect to working_times.db...") + +while not connected and retry_count < max_retries: + try: + # Try to connect with 'access_mode=read_only' to avoid lock conflicts + con = duckdb.connect('working_times.db') + connected = True + print("Connected to working_times.db") + except Exception as e: + retry_count += 1 + print(f"Connection attempt {retry_count} failed: {e}") + if retry_count < max_retries: + print(f"Retrying in {retry_count} seconds...") + time.sleep(retry_count) + else: + print("Maximum retries reached. Exiting.") + exit(1) + +print("Transforming data...") + +# Get the transformation timestamp +transform_timestamp = datetime.datetime.now() +print(f"Transform timestamp: {transform_timestamp}") + +# Create a new table with transformed data +# This query will: +# 1. Extract date and project information +# 2. Calculate total hours per project per day +# 3. 
Format the data in a more analytical friendly way +con.execute(""" + DROP TABLE IF EXISTS working_times_summary; + CREATE TABLE working_times_summary AS + SELECT + Datum AS date, + Projektname AS project_name, + "Leistungsart (Bezeichnung)" AS activity_type, + SUM("Zeit [h]") AS total_hours, + SUM("Zeit [h]"/8) AS total_days, + '{transform_timestamp}' AS transform_timestamp + FROM working_times + GROUP BY date, project_name, activity_type + ORDER BY date, project_name, activity_type; +""".format(transform_timestamp=transform_timestamp)) + +# Create a table with project totals +con.execute(""" + DROP TABLE IF EXISTS project_summary; + CREATE TABLE project_summary AS + SELECT + Projektname AS project_name, + SUM("Zeit [h]") AS total_hours, + SUM("Zeit [h]"/8) AS total_days, + COUNT(DISTINCT Datum) AS days_worked, + MAX(import_timestamp) AS source_import_timestamp, + '{transform_timestamp}' AS transform_timestamp + FROM working_times + GROUP BY project_name + ORDER BY total_hours DESC; +""".format(transform_timestamp=transform_timestamp)) + +# Create a table with daily totals +con.execute(""" + DROP TABLE IF EXISTS daily_summary; + CREATE TABLE daily_summary AS + SELECT + Datum AS date, + SUM("Zeit [h]") AS total_hours, + COUNT(*) AS entry_count, + COUNT(DISTINCT Projektname) AS project_count, + MAX(import_timestamp) AS source_import_timestamp, + '{transform_timestamp}' AS transform_timestamp + FROM working_times + GROUP BY date + ORDER BY date; +""".format(transform_timestamp=transform_timestamp)) + +# Verify the data was transformed +summary_count = con.execute("SELECT COUNT(*) FROM working_times_summary").fetchone()[0] +project_count = con.execute("SELECT COUNT(*) FROM project_summary").fetchone()[0] +daily_count = con.execute("SELECT COUNT(*) FROM daily_summary").fetchone()[0] + +print(f"Successfully created {summary_count} records in working_times_summary table.") +print(f"Successfully created {project_count} records in project_summary table.") 
+print(f"Successfully created {daily_count} records in daily_summary table.") + +# Print a sample of the summary table +print("\nSample of working_times_summary table:") +summary_sample = con.execute("SELECT date, project_name, activity_type, total_hours, total_days, transform_timestamp FROM working_times_summary LIMIT 5").fetchall() +for row in summary_sample: + print(row) + +# Print a sample of the project summary table +print("\nProject summary (top 5 by hours):") +project_sample = con.execute("SELECT project_name, total_hours, total_days, days_worked, transform_timestamp FROM project_summary LIMIT 5").fetchall() +for row in project_sample: + print(row) + +# Total hours worked +total_hours = con.execute("SELECT SUM(total_hours) FROM project_summary").fetchone()[0] +total_days = con.execute("SELECT SUM(total_days) FROM project_summary").fetchone()[0] +print(f"\nTotal hours worked: {total_hours:.2f}") +print(f"Total days worked: {total_days:.2f}") + +# Close the connection +con.close() + +print("\nData transformation complete.") \ No newline at end of file diff --git a/working_times.db b/working_times.db new file mode 100644 index 0000000..4db2a6f Binary files /dev/null and b/working_times.db differ