commit bee6508f4c5bcc06281d878425e2428401f87a2a Author: lasse Date: Fri Apr 18 22:34:10 2025 +0200 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..264a49e --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv/ +data/* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e15bae2 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +# Working Time Analysis + +This project analyzes working time data from a CSV file by: +1. Importing the data into a DuckDB database +2. Transforming the data for analysis +3. Generating reports based on the data + +## Setup + +### Dependencies + +Install the required dependencies: + +```bash +pip install -r requirements.txt +``` + +## Usage + +### 1. Import Data + +Run the import script to load the CSV data into DuckDB: + +```bash +python3 import_data.py +``` + +This will: +- Create a DuckDB database file `working_times.db` +- Import the CSV data into a table +- Add an import timestamp to each record + +### 2. Transform Data + +Run the transformation script to create analytical views: + +```bash +python3 transform_data.py +``` + +This will: +- Create summary tables with aggregated data +- Convert hours to days (using 8 hours = 1 day conversion) +- Add transformation timestamps + +### 3. Analyze Data + +Run the analysis script to generate reports: + +```bash +python3 analyze_data.py +``` + +This will produce: +- Overall time summary +- Top projects by hours +- Busiest days +- Day distribution analysis +- Project-activity combinations + +## Data Structure + +The analysis uses the following tables: + +- `working_times`: Raw imported data +- `working_times_summary`: Per-day, per-project aggregation +- `project_summary`: Total time per project +- `daily_summary`: Total time per day + +Each derived table includes timestamps for data lineage tracking. 
"""analyze_data.py — generate working-time reports from the DuckDB summaries.

Reads the tables produced by transform_data.py (working_times_summary,
project_summary, daily_summary) in working_times.db and prints a report:
overall totals, top projects, busiest days, day distribution, and the top
project/activity combinations.

Fixes over the original: the whole report ran at module import time and
called exit(1) at top level; it is now wrapped in main() behind a
__main__ guard, and the database connection is closed via try/finally
even when a report query fails.
"""
from datetime import datetime


def format_hours(hours):
    """Format a float hour count as e.g. '7.50h'."""
    return f"{hours:.2f}h"


def format_days(days):
    """Format a float day count as e.g. '0.94d'."""
    return f"{days:.2f}d"


def main():
    """Run every report section against working_times.db and print them."""
    # Imported lazily so the module can be imported without duckdb installed.
    import duckdb

    try:
        con = duckdb.connect('working_times.db')
        print("Connected to working_times.db")
    except Exception as e:
        print(f"Error connecting to database: {e}")
        raise SystemExit(1)

    # Timestamp for this analysis run (printed in header and footer).
    analysis_timestamp = datetime.now()

    try:
        # Date range covered by the data.
        start_date, end_date = con.execute("""
            SELECT MIN(date) AS start_date, MAX(date) AS end_date
            FROM daily_summary
        """).fetchone()

        # Data lineage: most recent transform / import timestamps.
        transform_timestamp, source_import_timestamp = con.execute("""
            SELECT
                MAX(transform_timestamp) AS transform_timestamp,
                MAX(source_import_timestamp) AS source_import_timestamp
            FROM daily_summary
        """).fetchone()

        print("\n" + "=" * 60)
        print(f"WORKING TIME ANALYSIS: {start_date} to {end_date}")
        print(f"ANALYSIS TIMESTAMP: {analysis_timestamp}")
        print(f"DATA TRANSFORMATION: {transform_timestamp}")
        print(f"DATA IMPORT: {source_import_timestamp}")
        print("=" * 60)

        # Overall totals.
        total_hours, total_days = con.execute("""
            SELECT SUM(total_hours) AS total_hours, SUM(total_days) AS total_days
            FROM project_summary
        """).fetchone()
        num_working_days = con.execute(
            "SELECT COUNT(*) FROM daily_summary WHERE total_hours > 0"
        ).fetchone()[0]
        avg_hours_per_day = total_hours / num_working_days if num_working_days > 0 else 0

        print(f"\nTOTAL HOURS: {format_hours(total_hours)}")
        print(f"TOTAL DAYS: {format_days(total_days)}")
        print(f"WORKING DAYS: {num_working_days}")
        print(f"AVG HOURS PER WORKING DAY: {format_hours(avg_hours_per_day)}")

        # Top projects by total hours.
        top_projects = con.execute("""
            SELECT project_name, total_hours, total_days, days_worked
            FROM project_summary
            ORDER BY total_hours DESC
            LIMIT 5
        """).fetchall()

        print("\n" + "-" * 60)
        print("TOP 5 PROJECTS BY HOURS")
        print("-" * 60)
        for i, (project, hours, days, worked_days) in enumerate(top_projects, 1):
            percent = (hours / total_hours) * 100
            # Falls back to bare 0 when a project has no worked days
            # (matches the original output exactly).
            daily_avg = format_hours(hours / worked_days) if worked_days > 0 else 0
            print(f"{i}. {project}")
            print(f"   {format_hours(hours)} ({percent:.1f}% of total) / {format_days(days)}")
            print(f"   Across {worked_days} days, daily average: {daily_avg}")

        # Busiest days.
        busiest_days = con.execute("""
            SELECT date, total_hours, project_count
            FROM daily_summary
            WHERE total_hours > 0
            ORDER BY total_hours DESC
            LIMIT 5
        """).fetchall()

        print("\n" + "-" * 60)
        print("TOP 5 BUSIEST DAYS")
        print("-" * 60)
        for i, (date, hours, project_count) in enumerate(busiest_days, 1):
            # 8 working hours == 1 day (project convention).
            day_equivalent = hours / 8
            print(f"{i}. {date}: {format_hours(hours)} ({format_days(day_equivalent)}) across {project_count} projects")

        # Distribution of working days by hour bucket.
        day_distribution = con.execute("""
            SELECT
                CASE
                    WHEN total_hours <= 4 THEN '0-4 hours'
                    WHEN total_hours <= 6 THEN '4-6 hours'
                    WHEN total_hours <= 8 THEN '6-8 hours'
                    WHEN total_hours <= 10 THEN '8-10 hours'
                    ELSE '10+ hours'
                END AS hour_range,
                COUNT(*) as day_count
            FROM daily_summary
            WHERE total_hours > 0
            GROUP BY hour_range
            ORDER BY
                CASE
                    WHEN hour_range = '0-4 hours' THEN 1
                    WHEN hour_range = '4-6 hours' THEN 2
                    WHEN hour_range = '6-8 hours' THEN 3
                    WHEN hour_range = '8-10 hours' THEN 4
                    ELSE 5
                END
        """).fetchall()

        print("\n" + "-" * 60)
        print("DAY DISTRIBUTION")
        print("-" * 60)
        for hour_range, day_count in day_distribution:
            percent = (day_count / num_working_days) * 100
            print(f"{hour_range}: {day_count} days ({percent:.1f}%)")

        # Top project/activity combinations.
        project_activity_combo = con.execute("""
            SELECT
                project_name,
                activity_type,
                SUM(total_hours) as hours,
                SUM(total_days) as days
            FROM working_times_summary
            GROUP BY project_name, activity_type
            ORDER BY hours DESC
            LIMIT 10
        """).fetchall()

        print("\n" + "-" * 60)
        print("TOP 10 PROJECT-ACTIVITY COMBINATIONS")
        print("-" * 60)
        for project, activity, hours, days in project_activity_combo:
            percent = (hours / total_hours) * 100
            print(f"{project} - {activity}: {format_hours(hours)} ({format_days(days)}, {percent:.1f}%)")

        print("\n" + "=" * 60)
        print(f"END OF ANALYSIS - Generated at {analysis_timestamp}")
        print("=" * 60)
    finally:
        # Always release the database handle, even if a query failed.
        con.close()


if __name__ == "__main__":
    main()
"""import_data.py — import the working-time CSV export into DuckDB.

Loads the semicolon-delimited CSV into the `working_times` table of
working_times.db and stamps every row with the import time for data
lineage.

Fixes over the original: the staging table was created with
CREATE TABLE IF NOT EXISTS, so a stale table from an earlier aborted run
could be silently reused; the CSV path and timestamp were interpolated
into the SQL text via str.format (storing the timestamp as a VARCHAR);
the connection was opened before the file check and never closed on
error; the whole script ran at import time.
"""
import datetime
import os


def main():
    """Import the CSV into working_times.db, replacing any previous table."""
    # Imported lazily so the module can be imported without duckdb installed.
    import duckdb

    # Path to the CSV file exported from the time-tracking tool.
    csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'

    # Check the file before touching the database at all.
    if not os.path.exists(csv_file):
        print(f"Error: File {csv_file} not found")
        raise SystemExit(1)

    print(f"Importing data from {csv_file}...")

    # Current timestamp for the import (stored on every row for lineage).
    import_timestamp = datetime.datetime.now()
    print(f"Import timestamp: {import_timestamp}")

    con = duckdb.connect('working_times.db')
    try:
        # CREATE OR REPLACE (not IF NOT EXISTS) so a stale staging table
        # from an aborted run can never be reused silently. The path is
        # bound as a prepared parameter instead of string-formatted SQL.
        con.execute("""
            CREATE OR REPLACE TABLE temp_working_times AS
            SELECT * FROM read_csv_auto(
                ?,
                delim=';',
                header=true,
                ignore_errors=true,
                sample_size=1000,
                auto_detect=true,
                decimal_separator=','
            )
        """, [csv_file])

        # Drop the existing table if it exists.
        con.execute("DROP TABLE IF EXISTS working_times")

        # Final table with the import timestamp column; binding the
        # datetime as a parameter stores a proper TIMESTAMP value.
        con.execute("""
            CREATE TABLE working_times AS
            SELECT
                *,
                ? AS import_timestamp
            FROM temp_working_times
        """, [import_timestamp])

        # Drop the staging table.
        con.execute("DROP TABLE IF EXISTS temp_working_times")

        # Verify the data was imported.
        count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
        print(f"Successfully imported {count} records into the working_times table.")

        # Show the table schema.
        print("\nTable Schema:")
        for col in con.execute("DESCRIBE working_times").fetchall():
            print(f"{col[0]}: {col[1]}")
    finally:
        # Always release the database handle.
        con.close()

    print("\nData import complete. Database saved to working_times.db")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Working Time Analysis - Complete Workflow

This script runs all three steps of the analysis in sequence:
1. Import the data    (import_data.py)
2. Transform the data (transform_data.py)
3. Generate analysis reports (analyze_data.py)

Dependencies (see requirements.txt): duckdb==1.2.2, pandas>=2.2.0,
python-dateutil>=2.8.2, pytz>=2025.1, numpy>=1.22.4.
"""

import os
import subprocess
import sys
import time


def run_step(script_name, step_desc):
    """Run one analysis step as a subprocess and report its outcome.

    Args:
        script_name: path of the Python script to execute.
        step_desc: human-readable banner text for the step.

    Returns:
        True when the script exits 0, False otherwise. The script's
        output is echoed either way.
    """
    print(f"\n{'='*60}")
    print(f"STEP: {step_desc}")
    print(f"{'='*60}")

    try:
        # check=True raises CalledProcessError on a non-zero exit code.
        result = subprocess.run(
            [sys.executable, script_name],
            capture_output=True,
            text=True,
            check=True,
        )
        print(result.stdout)
        return True
    except subprocess.CalledProcessError as e:
        print(f"ERROR in {script_name}:")
        # BUG FIX: also surface stdout on failure — the failing script's
        # progress messages were previously discarded with only stderr shown.
        if e.stdout:
            print(e.stdout)
        print(e.stderr)
        return False


def main():
    """Run import -> transform -> analyze; return True on full success."""
    # Fail fast if the source CSV is missing.
    csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'
    if not os.path.exists(csv_file):
        print(f"Error: CSV file not found at {csv_file}")
        return False

    # Step 1: Import Data
    if not run_step('import_data.py', 'IMPORTING DATA'):
        return False
    # Wait a moment to ensure any file locks are released.
    time.sleep(1)

    # Step 2: Transform Data
    if not run_step('transform_data.py', 'TRANSFORMING DATA'):
        return False
    time.sleep(1)

    # Step 3: Analyze Data
    if not run_step('analyze_data.py', 'ANALYZING DATA'):
        return False

    print("\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)
    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
main() + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/transform_data.py b/transform_data.py new file mode 100644 index 0000000..7883204 --- /dev/null +++ b/transform_data.py @@ -0,0 +1,118 @@ +import duckdb +import pandas as pd +import os +import time +import datetime + +# Try to connect to the database with retry logic +max_retries = 5 +retry_count = 0 +connected = False + +print("Trying to connect to working_times.db...") + +while not connected and retry_count < max_retries: + try: + # Try to connect with 'access_mode=read_only' to avoid lock conflicts + con = duckdb.connect('working_times.db') + connected = True + print("Connected to working_times.db") + except Exception as e: + retry_count += 1 + print(f"Connection attempt {retry_count} failed: {e}") + if retry_count < max_retries: + print(f"Retrying in {retry_count} seconds...") + time.sleep(retry_count) + else: + print("Maximum retries reached. Exiting.") + exit(1) + +print("Transforming data...") + +# Get the transformation timestamp +transform_timestamp = datetime.datetime.now() +print(f"Transform timestamp: {transform_timestamp}") + +# Create a new table with transformed data +# This query will: +# 1. Extract date and project information +# 2. Calculate total hours per project per day +# 3. 
Format the data in a more analytical friendly way +con.execute(""" + DROP TABLE IF EXISTS working_times_summary; + CREATE TABLE working_times_summary AS + SELECT + Datum AS date, + Projektname AS project_name, + "Leistungsart (Bezeichnung)" AS activity_type, + SUM("Zeit [h]") AS total_hours, + SUM("Zeit [h]"/8) AS total_days, + '{transform_timestamp}' AS transform_timestamp + FROM working_times + GROUP BY date, project_name, activity_type + ORDER BY date, project_name, activity_type; +""".format(transform_timestamp=transform_timestamp)) + +# Create a table with project totals +con.execute(""" + DROP TABLE IF EXISTS project_summary; + CREATE TABLE project_summary AS + SELECT + Projektname AS project_name, + SUM("Zeit [h]") AS total_hours, + SUM("Zeit [h]"/8) AS total_days, + COUNT(DISTINCT Datum) AS days_worked, + MAX(import_timestamp) AS source_import_timestamp, + '{transform_timestamp}' AS transform_timestamp + FROM working_times + GROUP BY project_name + ORDER BY total_hours DESC; +""".format(transform_timestamp=transform_timestamp)) + +# Create a table with daily totals +con.execute(""" + DROP TABLE IF EXISTS daily_summary; + CREATE TABLE daily_summary AS + SELECT + Datum AS date, + SUM("Zeit [h]") AS total_hours, + COUNT(*) AS entry_count, + COUNT(DISTINCT Projektname) AS project_count, + MAX(import_timestamp) AS source_import_timestamp, + '{transform_timestamp}' AS transform_timestamp + FROM working_times + GROUP BY date + ORDER BY date; +""".format(transform_timestamp=transform_timestamp)) + +# Verify the data was transformed +summary_count = con.execute("SELECT COUNT(*) FROM working_times_summary").fetchone()[0] +project_count = con.execute("SELECT COUNT(*) FROM project_summary").fetchone()[0] +daily_count = con.execute("SELECT COUNT(*) FROM daily_summary").fetchone()[0] + +print(f"Successfully created {summary_count} records in working_times_summary table.") +print(f"Successfully created {project_count} records in project_summary table.") 
+print(f"Successfully created {daily_count} records in daily_summary table.") + +# Print a sample of the summary table +print("\nSample of working_times_summary table:") +summary_sample = con.execute("SELECT date, project_name, activity_type, total_hours, total_days, transform_timestamp FROM working_times_summary LIMIT 5").fetchall() +for row in summary_sample: + print(row) + +# Print a sample of the project summary table +print("\nProject summary (top 5 by hours):") +project_sample = con.execute("SELECT project_name, total_hours, total_days, days_worked, transform_timestamp FROM project_summary LIMIT 5").fetchall() +for row in project_sample: + print(row) + +# Total hours worked +total_hours = con.execute("SELECT SUM(total_hours) FROM project_summary").fetchone()[0] +total_days = con.execute("SELECT SUM(total_days) FROM project_summary").fetchone()[0] +print(f"\nTotal hours worked: {total_hours:.2f}") +print(f"Total days worked: {total_days:.2f}") + +# Close the connection +con.close() + +print("\nData transformation complete.") \ No newline at end of file diff --git a/working_times.db b/working_times.db new file mode 100644 index 0000000..4db2a6f Binary files /dev/null and b/working_times.db differ