first commit
commit bee6508f4c
.gitignore (vendored, new file)
@@ -0,0 +1,2 @@
/venv
/data/*
README.md (new file)
@@ -0,0 +1,70 @@
# Working Time Analysis

This project analyzes working time data from a CSV file by:

1. Importing the data into a DuckDB database
2. Transforming the data for analysis
3. Generating reports based on the data

## Setup

### Dependencies

Install the required dependencies:

```bash
pip install -r requirements.txt
```

## Usage

Each step can be run on its own as described below, or all three in sequence with `run_analysis.py`.

### 1. Import Data

Run the import script to load the CSV data into DuckDB:

```bash
python3 import_data.py
```

This will:

- Create a DuckDB database file `working_times.db`
- Import the CSV data into a table
- Add an import timestamp to each record

### 2. Transform Data

Run the transformation script to create analytical summary tables:

```bash
python3 transform_data.py
```

This will:

- Create summary tables with aggregated data
- Convert hours to days at 8 hours = 1 day (e.g. a 12-hour entry becomes 1.5 days)
- Add transformation timestamps

### 3. Analyze Data

Run the analysis script to generate reports:

```bash
python3 analyze_data.py
```

This will produce:

- Overall time summary
- Top projects by hours
- Busiest days
- Day distribution analysis
- Project-activity combinations

## Data Structure

The analysis uses the following tables:

- `working_times`: Raw imported data
- `working_times_summary`: Per-day, per-project aggregation
- `project_summary`: Total time per project
- `daily_summary`: Total time per day

Each derived table includes timestamps for data lineage tracking.
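These tables can also be inspected outside the scripts. A minimal sketch using DuckDB's Python API (opened read-only so it does not take the write lock the scripts need):

```python
import duckdb

# Read-only connection; table and column names are those created by transform_data.py
con = duckdb.connect('working_times.db', read_only=True)
rows = con.execute(
    "SELECT project_name, total_hours, total_days "
    "FROM project_summary ORDER BY total_hours DESC LIMIT 3"
).fetchall()
for name, hours, days in rows:
    print(f"{name}: {hours:.2f}h ({days:.2f}d)")
con.close()
```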
analyze_data.py (new file)
@@ -0,0 +1,157 @@
import duckdb
from datetime import datetime

# Connect to the database
try:
    con = duckdb.connect('working_times.db')
    print("Connected to working_times.db")
except Exception as e:
    print(f"Error connecting to database: {e}")
    exit(1)

# Get the current analysis timestamp
analysis_timestamp = datetime.now()

# Function to format hours
def format_hours(hours):
    return f"{hours:.2f}h"

# Function to format days
def format_days(days):
    return f"{days:.2f}d"

# Get the date range of the data
date_range = con.execute("""
    SELECT MIN(date) AS start_date, MAX(date) AS end_date
    FROM daily_summary
""").fetchone()

start_date = date_range[0]
end_date = date_range[1]

# Get the most recent transformation and import timestamps
transform_info = con.execute("""
    SELECT
        MAX(transform_timestamp) AS transform_timestamp,
        MAX(source_import_timestamp) AS source_import_timestamp
    FROM daily_summary
""").fetchone()

transform_timestamp = transform_info[0]
source_import_timestamp = transform_info[1]

print("\n" + "=" * 60)
print(f"WORKING TIME ANALYSIS: {start_date} to {end_date}")
print(f"ANALYSIS TIMESTAMP: {analysis_timestamp}")
print(f"DATA TRANSFORMATION: {transform_timestamp}")
print(f"DATA IMPORT: {source_import_timestamp}")
print("=" * 60)

# Get the total hours and days worked
totals = con.execute("""
    SELECT SUM(total_hours) AS total_hours, SUM(total_days) AS total_days
    FROM project_summary
""").fetchone()

total_hours = totals[0]
total_days = totals[1]
num_working_days = con.execute("SELECT COUNT(*) FROM daily_summary WHERE total_hours > 0").fetchone()[0]
avg_hours_per_day = total_hours / num_working_days if num_working_days > 0 else 0

print(f"\nTOTAL HOURS: {format_hours(total_hours)}")
print(f"TOTAL DAYS: {format_days(total_days)}")
print(f"WORKING DAYS: {num_working_days}")
print(f"AVG HOURS PER WORKING DAY: {format_hours(avg_hours_per_day)}")

# Get the top projects by hours
top_projects = con.execute("""
    SELECT project_name, total_hours, total_days, days_worked
    FROM project_summary
    ORDER BY total_hours DESC
    LIMIT 5
""").fetchall()

print("\n" + "-" * 60)
print("TOP 5 PROJECTS BY HOURS")
print("-" * 60)
for i, (project, hours, days, worked_days) in enumerate(top_projects, 1):
    percent = (hours / total_hours) * 100
    print(f"{i}. {project}")
    print(f"   {format_hours(hours)} ({percent:.1f}% of total) / {format_days(days)}")
    daily_avg = format_hours(hours / worked_days) if worked_days > 0 else "n/a"
    print(f"   Across {worked_days} days, daily average: {daily_avg}")

# Get the busiest days
busiest_days = con.execute("""
    SELECT date, total_hours, project_count
    FROM daily_summary
    WHERE total_hours > 0
    ORDER BY total_hours DESC
    LIMIT 5
""").fetchall()

print("\n" + "-" * 60)
print("TOP 5 BUSIEST DAYS")
print("-" * 60)
for i, (date, hours, project_count) in enumerate(busiest_days, 1):
    # Calculate day equivalent
    day_equivalent = hours / 8
    print(f"{i}. {date}: {format_hours(hours)} ({format_days(day_equivalent)}) across {project_count} projects")

# Get day distribution
day_distribution = con.execute("""
    SELECT
        CASE
            WHEN total_hours <= 4 THEN '0-4 hours'
            WHEN total_hours <= 6 THEN '4-6 hours'
            WHEN total_hours <= 8 THEN '6-8 hours'
            WHEN total_hours <= 10 THEN '8-10 hours'
            ELSE '10+ hours'
        END AS hour_range,
        COUNT(*) AS day_count
    FROM daily_summary
    WHERE total_hours > 0
    GROUP BY hour_range
    -- Sort buckets in hour order; a plain ORDER BY hour_range would sort alphabetically
    ORDER BY
        CASE
            WHEN hour_range = '0-4 hours' THEN 1
            WHEN hour_range = '4-6 hours' THEN 2
            WHEN hour_range = '6-8 hours' THEN 3
            WHEN hour_range = '8-10 hours' THEN 4
            ELSE 5
        END
""").fetchall()

print("\n" + "-" * 60)
print("DAY DISTRIBUTION")
print("-" * 60)
for hour_range, day_count in day_distribution:
    percent = (day_count / num_working_days) * 100
    print(f"{hour_range}: {day_count} days ({percent:.1f}%)")

# Print an overview of project/activity combinations
project_activity_combo = con.execute("""
    SELECT
        project_name,
        activity_type,
        SUM(total_hours) AS hours,
        SUM(total_days) AS days
    FROM working_times_summary
    GROUP BY project_name, activity_type
    ORDER BY hours DESC
    LIMIT 10
""").fetchall()

print("\n" + "-" * 60)
print("TOP 10 PROJECT-ACTIVITY COMBINATIONS")
print("-" * 60)
for project, activity, hours, days in project_activity_combo:
    percent = (hours / total_hours) * 100
    print(f"{project} - {activity}: {format_hours(hours)} ({format_days(days)}, {percent:.1f}%)")

print("\n" + "=" * 60)
print(f"END OF ANALYSIS - Generated at {analysis_timestamp}")
print("=" * 60)

# Close the connection
con.close()
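analyze_data.py assumes transform_data.py has already created the summary tables; if it has not, the first query fails with a catalog error. A small guard one could add near the top, as a sketch (table names are those created by the scripts in this commit):

```python
import duckdb

con = duckdb.connect('working_times.db')
# information_schema.tables lists the tables in the database; bail out
# early if the transform step has not produced the summary tables yet
existing = {row[0] for row in con.execute(
    "SELECT table_name FROM information_schema.tables").fetchall()}
missing = {'working_times_summary', 'project_summary', 'daily_summary'} - existing
if missing:
    print(f"Run transform_data.py first; missing tables: {sorted(missing)}")
    raise SystemExit(1)
```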
import_data.py (new file)
@@ -0,0 +1,65 @@
import duckdb
import os
import datetime

# Create connection to DuckDB
con = duckdb.connect('working_times.db')

# Path to the CSV file
csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'

# Check if file exists
if not os.path.exists(csv_file):
    print(f"Error: File {csv_file} not found")
    exit(1)

print(f"Importing data from {csv_file}...")

# Current timestamp for the import
import_timestamp = datetime.datetime.now()
print(f"Import timestamp: {import_timestamp}")

# First, load the CSV into a temporary staging table
# (CREATE OR REPLACE so a leftover table from a failed run is not reused)
con.execute("""
    CREATE OR REPLACE TABLE temp_working_times AS
    SELECT * FROM read_csv_auto(
        '{csv_file}',
        delim=';',
        header=true,
        ignore_errors=true,
        sample_size=1000,
        auto_detect=true,
        decimal_separator=','
    )
""".format(csv_file=csv_file))

# Drop the existing table if it exists
con.execute("DROP TABLE IF EXISTS working_times")

# Now create the final table with the timestamp column
con.execute("""
    CREATE TABLE working_times AS
    SELECT
        *,
        '{timestamp}' AS import_timestamp
    FROM temp_working_times
""".format(timestamp=import_timestamp))

# Drop the temporary table
con.execute("DROP TABLE IF EXISTS temp_working_times")

# Verify the data was imported
count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
print(f"Successfully imported {count} records into the working_times table.")

# Show the table schema
print("\nTable Schema:")
schema = con.execute("DESCRIBE working_times").fetchall()
for col in schema:
    print(f"{col[0]}: {col[1]}")

# Close the connection
con.close()

print("\nData import complete. Database saved to working_times.db")
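Interpolating the path into the SQL string works for this fixed filename, but breaks as soon as a path contains a quote. A sketch of an alternative via DuckDB's Python relation API; the keyword names are assumptions against duckdb 1.2.x, and the decimal-separator option from the original call is omitted:

```python
import duckdb

con = duckdb.connect('working_times.db')
# Build a relation from the CSV without string-formatting SQL;
# 'delimiter' and 'header' mirror the read_csv_auto options above
rel = con.read_csv('data/lawi-2025-04-01-2025-04-30-2025-04-17.csv',
                   delimiter=';', header=True)
rel.create('working_times_staged')  # hypothetical staging table name
con.close()
```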
requirements.txt (new file)
@@ -0,0 +1,5 @@
duckdb==1.2.2
pandas>=2.2.0
python-dateutil>=2.8.2
pytz>=2025.1
numpy>=1.22.4
run_analysis.py (new file)
@@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""
Working Time Analysis - Complete Workflow
This script runs all three steps of the analysis in sequence:
1. Import the data
2. Transform the data
3. Generate analysis reports
"""

import os
import sys
import subprocess
import time

def run_step(script_name, step_desc):
    """Run a step in the analysis and handle errors"""
    print(f"\n{'='*60}")
    print(f"STEP: {step_desc}")
    print(f"{'='*60}")

    try:
        # Run the script and capture output
        result = subprocess.run(
            [sys.executable, script_name],
            capture_output=True,
            text=True,
            check=True
        )
        print(result.stdout)
        return True
    except subprocess.CalledProcessError as e:
        print(f"ERROR in {script_name}:")
        print(e.stderr)
        return False

def main():
    # Check if the CSV file exists
    csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'
    if not os.path.exists(csv_file):
        print(f"Error: CSV file not found at {csv_file}")
        return False

    # Step 1: Import Data
    if not run_step('import_data.py', 'IMPORTING DATA'):
        return False

    # Wait a moment to ensure any file locks are released
    time.sleep(1)

    # Step 2: Transform Data
    if not run_step('transform_data.py', 'TRANSFORMING DATA'):
        return False

    # Wait a moment to ensure any file locks are released
    time.sleep(1)

    # Step 3: Analyze Data
    if not run_step('analyze_data.py', 'ANALYZING DATA'):
        return False

    print("\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)
    return True

if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
transform_data.py (new file)
@@ -0,0 +1,118 @@
import duckdb
import time
import datetime

# Try to connect to the database with retry logic
max_retries = 5
retry_count = 0
connected = False

print("Trying to connect to working_times.db...")

while not connected and retry_count < max_retries:
    try:
        # Another process may still hold the database lock, so retry with backoff
        con = duckdb.connect('working_times.db')
        connected = True
        print("Connected to working_times.db")
    except Exception as e:
        retry_count += 1
        print(f"Connection attempt {retry_count} failed: {e}")
        if retry_count < max_retries:
            print(f"Retrying in {retry_count} seconds...")
            time.sleep(retry_count)
        else:
            print("Maximum retries reached. Exiting.")
            exit(1)

print("Transforming data...")

# Get the transformation timestamp
transform_timestamp = datetime.datetime.now()
print(f"Transform timestamp: {transform_timestamp}")

# Create a new table with transformed data
# This query will:
# 1. Extract date and project information
# 2. Calculate total hours per project per day
# 3. Reshape the data into an analysis-friendly form
con.execute("""
    DROP TABLE IF EXISTS working_times_summary;
    CREATE TABLE working_times_summary AS
    SELECT
        Datum AS date,
        Projektname AS project_name,
        "Leistungsart (Bezeichnung)" AS activity_type,
        SUM("Zeit [h]") AS total_hours,
        SUM("Zeit [h]" / 8) AS total_days,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY date, project_name, activity_type
    ORDER BY date, project_name, activity_type;
""".format(transform_timestamp=transform_timestamp))

# Create a table with project totals
con.execute("""
    DROP TABLE IF EXISTS project_summary;
    CREATE TABLE project_summary AS
    SELECT
        Projektname AS project_name,
        SUM("Zeit [h]") AS total_hours,
        SUM("Zeit [h]" / 8) AS total_days,
        COUNT(DISTINCT Datum) AS days_worked,
        MAX(import_timestamp) AS source_import_timestamp,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY project_name
    ORDER BY total_hours DESC;
""".format(transform_timestamp=transform_timestamp))

# Create a table with daily totals
con.execute("""
    DROP TABLE IF EXISTS daily_summary;
    CREATE TABLE daily_summary AS
    SELECT
        Datum AS date,
        SUM("Zeit [h]") AS total_hours,
        COUNT(*) AS entry_count,
        COUNT(DISTINCT Projektname) AS project_count,
        MAX(import_timestamp) AS source_import_timestamp,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY date
    ORDER BY date;
""".format(transform_timestamp=transform_timestamp))

# Verify the data was transformed
summary_count = con.execute("SELECT COUNT(*) FROM working_times_summary").fetchone()[0]
project_count = con.execute("SELECT COUNT(*) FROM project_summary").fetchone()[0]
daily_count = con.execute("SELECT COUNT(*) FROM daily_summary").fetchone()[0]

print(f"Successfully created {summary_count} records in working_times_summary table.")
print(f"Successfully created {project_count} records in project_summary table.")
print(f"Successfully created {daily_count} records in daily_summary table.")

# Print a sample of the summary table
print("\nSample of working_times_summary table:")
summary_sample = con.execute("SELECT date, project_name, activity_type, total_hours, total_days, transform_timestamp FROM working_times_summary LIMIT 5").fetchall()
for row in summary_sample:
    print(row)

# Print a sample of the project summary table
print("\nProject summary (top 5 by hours):")
project_sample = con.execute("SELECT project_name, total_hours, total_days, days_worked, transform_timestamp FROM project_summary LIMIT 5").fetchall()
for row in project_sample:
    print(row)

# Total hours worked
total_hours = con.execute("SELECT SUM(total_hours) FROM project_summary").fetchone()[0]
total_days = con.execute("SELECT SUM(total_days) FROM project_summary").fetchone()[0]
print(f"\nTotal hours worked: {total_hours:.2f}")
print(f"Total days worked: {total_days:.2f}")

# Close the connection
con.close()

print("\nData transformation complete.")
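Since all three summary tables are derived from the same `working_times` rows, a quick consistency check can confirm the transform dropped nothing. A minimal sketch, assuming the column names created above:

```python
import duckdb

con = duckdb.connect('working_times.db', read_only=True)
raw_hours = con.execute('SELECT SUM("Zeit [h]") FROM working_times').fetchone()[0]
agg_hours = con.execute('SELECT SUM(total_hours) FROM working_times_summary').fetchone()[0]
con.close()

# Both sums cover the same rows, so they should agree up to float rounding
assert abs(raw_hours - agg_hours) < 1e-6, f"raw {raw_hours} != aggregated {agg_hours}"
print(f"OK: {raw_hours:.2f}h in both raw and summary tables")
```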
working_times.db (new binary file)
Binary file not shown.