first commit
commit bee6508f4c
.gitignore (vendored, new file)
@@ -0,0 +1,2 @@
/venv
/data/*
README.md (new file)
@@ -0,0 +1,70 @@
# Working Time Analysis

This project analyzes working time data from a CSV file by:

1. Importing the data into a DuckDB database
2. Transforming the data for analysis
3. Generating reports based on the data

## Setup

### Dependencies

Install the required dependencies:

```bash
pip install -r requirements.txt
```

## Usage

Each step can be run on its own as described below, or all three in sequence with `run_analysis.py`.

### 1. Import Data

Run the import script to load the CSV data into DuckDB:

```bash
python3 import_data.py
```

This will:

- Create a DuckDB database file `working_times.db`
- Import the CSV data into a table
- Add an import timestamp to each record

### 2. Transform Data

Run the transformation script to create analytical summary tables:

```bash
python3 transform_data.py
```

This will:

- Create summary tables with aggregated data
- Convert hours to days at 8 hours = 1 day (e.g. a 12-hour entry becomes 1.5 days)
- Add transformation timestamps

### 3. Analyze Data

Run the analysis script to generate reports:

```bash
python3 analyze_data.py
```

This will produce:

- Overall time summary
- Top projects by hours
- Busiest days
- Day distribution analysis
- Project-activity combinations

## Data Structure

The analysis uses the following tables:

- `working_times`: Raw imported data
- `working_times_summary`: Per-day, per-project aggregation
- `project_summary`: Total time per project
- `daily_summary`: Total time per day

Each derived table includes timestamps for data lineage tracking.
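These tables can also be inspected outside the scripts. A minimal sketch using DuckDB's Python API (opened read-only so it does not take the write lock the scripts need):

```python
import duckdb

# Read-only connection; table and column names are those created by transform_data.py
con = duckdb.connect('working_times.db', read_only=True)
rows = con.execute(
    "SELECT project_name, total_hours, total_days "
    "FROM project_summary ORDER BY total_hours DESC LIMIT 3"
).fetchall()
for name, hours, days in rows:
    print(f"{name}: {hours:.2f}h ({days:.2f}d)")
con.close()
```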
analyze_data.py (new file)
@@ -0,0 +1,157 @@
import duckdb
from datetime import datetime

# Connect to the database
try:
    con = duckdb.connect('working_times.db')
    print("Connected to working_times.db")
except Exception as e:
    print(f"Error connecting to database: {e}")
    exit(1)

# Get the current analysis timestamp
analysis_timestamp = datetime.now()

# Function to format hours
def format_hours(hours):
    return f"{hours:.2f}h"

# Function to format days
def format_days(days):
    return f"{days:.2f}d"

# Get the date range of the data
date_range = con.execute("""
    SELECT MIN(date) AS start_date, MAX(date) AS end_date
    FROM daily_summary
""").fetchone()

start_date = date_range[0]
end_date = date_range[1]

# Get the most recent transformation and import timestamps
transform_info = con.execute("""
    SELECT
        MAX(transform_timestamp) AS transform_timestamp,
        MAX(source_import_timestamp) AS source_import_timestamp
    FROM daily_summary
""").fetchone()

transform_timestamp = transform_info[0]
source_import_timestamp = transform_info[1]

print("\n" + "=" * 60)
print(f"WORKING TIME ANALYSIS: {start_date} to {end_date}")
print(f"ANALYSIS TIMESTAMP: {analysis_timestamp}")
print(f"DATA TRANSFORMATION: {transform_timestamp}")
print(f"DATA IMPORT: {source_import_timestamp}")
print("=" * 60)

# Get the total hours and days worked
totals = con.execute("""
    SELECT SUM(total_hours) AS total_hours, SUM(total_days) AS total_days
    FROM project_summary
""").fetchone()

total_hours = totals[0]
total_days = totals[1]
num_working_days = con.execute("SELECT COUNT(*) FROM daily_summary WHERE total_hours > 0").fetchone()[0]
avg_hours_per_day = total_hours / num_working_days if num_working_days > 0 else 0

print(f"\nTOTAL HOURS: {format_hours(total_hours)}")
print(f"TOTAL DAYS: {format_days(total_days)}")
print(f"WORKING DAYS: {num_working_days}")
print(f"AVG HOURS PER WORKING DAY: {format_hours(avg_hours_per_day)}")

# Get the top projects by hours
top_projects = con.execute("""
    SELECT project_name, total_hours, total_days, days_worked
    FROM project_summary
    ORDER BY total_hours DESC
    LIMIT 5
""").fetchall()

print("\n" + "-" * 60)
print("TOP 5 PROJECTS BY HOURS")
print("-" * 60)
for i, (project, hours, days, worked_days) in enumerate(top_projects, 1):
    percent = (hours / total_hours) * 100
    print(f"{i}. {project}")
    print(f"   {format_hours(hours)} ({percent:.1f}% of total) / {format_days(days)}")
    daily_avg = format_hours(hours / worked_days) if worked_days > 0 else "n/a"
    print(f"   Across {worked_days} days, daily average: {daily_avg}")

# Get the busiest days
busiest_days = con.execute("""
    SELECT date, total_hours, project_count
    FROM daily_summary
    WHERE total_hours > 0
    ORDER BY total_hours DESC
    LIMIT 5
""").fetchall()

print("\n" + "-" * 60)
print("TOP 5 BUSIEST DAYS")
print("-" * 60)
for i, (date, hours, project_count) in enumerate(busiest_days, 1):
    # Calculate day equivalent
    day_equivalent = hours / 8
    print(f"{i}. {date}: {format_hours(hours)} ({format_days(day_equivalent)}) across {project_count} projects")

# Get day distribution
day_distribution = con.execute("""
    SELECT
        CASE
            WHEN total_hours <= 4 THEN '0-4 hours'
            WHEN total_hours <= 6 THEN '4-6 hours'
            WHEN total_hours <= 8 THEN '6-8 hours'
            WHEN total_hours <= 10 THEN '8-10 hours'
            ELSE '10+ hours'
        END AS hour_range,
        COUNT(*) AS day_count
    FROM daily_summary
    WHERE total_hours > 0
    GROUP BY hour_range
    -- Sort buckets in hour order; a plain ORDER BY hour_range would sort alphabetically
    ORDER BY
        CASE
            WHEN hour_range = '0-4 hours' THEN 1
            WHEN hour_range = '4-6 hours' THEN 2
            WHEN hour_range = '6-8 hours' THEN 3
            WHEN hour_range = '8-10 hours' THEN 4
            ELSE 5
        END
""").fetchall()

print("\n" + "-" * 60)
print("DAY DISTRIBUTION")
print("-" * 60)
for hour_range, day_count in day_distribution:
    percent = (day_count / num_working_days) * 100
    print(f"{hour_range}: {day_count} days ({percent:.1f}%)")

# Print an overview of project/activity combinations
project_activity_combo = con.execute("""
    SELECT
        project_name,
        activity_type,
        SUM(total_hours) AS hours,
        SUM(total_days) AS days
    FROM working_times_summary
    GROUP BY project_name, activity_type
    ORDER BY hours DESC
    LIMIT 10
""").fetchall()

print("\n" + "-" * 60)
print("TOP 10 PROJECT-ACTIVITY COMBINATIONS")
print("-" * 60)
for project, activity, hours, days in project_activity_combo:
    percent = (hours / total_hours) * 100
    print(f"{project} - {activity}: {format_hours(hours)} ({format_days(days)}, {percent:.1f}%)")

print("\n" + "=" * 60)
print(f"END OF ANALYSIS - Generated at {analysis_timestamp}")
print("=" * 60)

# Close the connection
con.close()
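analyze_data.py assumes transform_data.py has already created the summary tables; if it has not, the first query fails with a catalog error. A small guard one could add near the top, as a sketch (table names are those created by the scripts in this commit):

```python
import duckdb

con = duckdb.connect('working_times.db')
# information_schema.tables lists the tables in the database; bail out
# early if the transform step has not produced the summary tables yet
existing = {row[0] for row in con.execute(
    "SELECT table_name FROM information_schema.tables").fetchall()}
missing = {'working_times_summary', 'project_summary', 'daily_summary'} - existing
if missing:
    print(f"Run transform_data.py first; missing tables: {sorted(missing)}")
    raise SystemExit(1)
```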
import_data.py (new file)
@@ -0,0 +1,65 @@
import duckdb
import os
import datetime

# Create connection to DuckDB
con = duckdb.connect('working_times.db')

# Path to the CSV file
csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'

# Check if file exists
if not os.path.exists(csv_file):
    print(f"Error: File {csv_file} not found")
    exit(1)

print(f"Importing data from {csv_file}...")

# Current timestamp for the import
import_timestamp = datetime.datetime.now()
print(f"Import timestamp: {import_timestamp}")

# First, load the CSV into a temporary staging table
# (CREATE OR REPLACE so a leftover table from a failed run is not reused)
con.execute("""
    CREATE OR REPLACE TABLE temp_working_times AS
    SELECT * FROM read_csv_auto(
        '{csv_file}',
        delim=';',
        header=true,
        ignore_errors=true,
        sample_size=1000,
        auto_detect=true,
        decimal_separator=','
    )
""".format(csv_file=csv_file))

# Drop the existing table if it exists
con.execute("DROP TABLE IF EXISTS working_times")

# Now create the final table with the timestamp column
con.execute("""
    CREATE TABLE working_times AS
    SELECT
        *,
        '{timestamp}' AS import_timestamp
    FROM temp_working_times
""".format(timestamp=import_timestamp))

# Drop the temporary table
con.execute("DROP TABLE IF EXISTS temp_working_times")

# Verify the data was imported
count = con.execute("SELECT COUNT(*) FROM working_times").fetchone()[0]
print(f"Successfully imported {count} records into the working_times table.")

# Show the table schema
print("\nTable Schema:")
schema = con.execute("DESCRIBE working_times").fetchall()
for col in schema:
    print(f"{col[0]}: {col[1]}")

# Close the connection
con.close()

print("\nData import complete. Database saved to working_times.db")
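Interpolating the path into the SQL string works for this fixed filename, but breaks as soon as a path contains a quote. A sketch of an alternative via DuckDB's Python relation API; the keyword names are assumptions against duckdb 1.2.x, and the decimal-separator option from the original call is omitted:

```python
import duckdb

con = duckdb.connect('working_times.db')
# Build a relation from the CSV without string-formatting SQL;
# 'delimiter' and 'header' mirror the read_csv_auto options above
rel = con.read_csv('data/lawi-2025-04-01-2025-04-30-2025-04-17.csv',
                   delimiter=';', header=True)
rel.create('working_times_staged')  # hypothetical staging table name
con.close()
```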
requirements.txt (new file)
@@ -0,0 +1,5 @@
duckdb==1.2.2
pandas>=2.2.0
python-dateutil>=2.8.2
pytz>=2025.1
numpy>=1.22.4
run_analysis.py (new file)
@@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""
Working Time Analysis - Complete Workflow
This script runs all three steps of the analysis in sequence:
1. Import the data
2. Transform the data
3. Generate analysis reports
"""

import os
import sys
import subprocess
import time

def run_step(script_name, step_desc):
    """Run a step in the analysis and handle errors"""
    print(f"\n{'='*60}")
    print(f"STEP: {step_desc}")
    print(f"{'='*60}")

    try:
        # Run the script and capture output
        result = subprocess.run(
            [sys.executable, script_name],
            capture_output=True,
            text=True,
            check=True
        )
        print(result.stdout)
        return True
    except subprocess.CalledProcessError as e:
        print(f"ERROR in {script_name}:")
        print(e.stderr)
        return False

def main():
    # Check if the CSV file exists
    csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'
    if not os.path.exists(csv_file):
        print(f"Error: CSV file not found at {csv_file}")
        return False

    # Step 1: Import Data
    if not run_step('import_data.py', 'IMPORTING DATA'):
        return False

    # Wait a moment to ensure any file locks are released
    time.sleep(1)

    # Step 2: Transform Data
    if not run_step('transform_data.py', 'TRANSFORMING DATA'):
        return False

    # Wait a moment to ensure any file locks are released
    time.sleep(1)

    # Step 3: Analyze Data
    if not run_step('analyze_data.py', 'ANALYZING DATA'):
        return False

    print("\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)
    return True

if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
transform_data.py (new file)
@@ -0,0 +1,118 @@
import duckdb
import time
import datetime

# Try to connect to the database with retry logic
max_retries = 5
retry_count = 0
connected = False

print("Trying to connect to working_times.db...")

while not connected and retry_count < max_retries:
    try:
        # Another process may still hold the database lock, so retry with backoff
        con = duckdb.connect('working_times.db')
        connected = True
        print("Connected to working_times.db")
    except Exception as e:
        retry_count += 1
        print(f"Connection attempt {retry_count} failed: {e}")
        if retry_count < max_retries:
            print(f"Retrying in {retry_count} seconds...")
            time.sleep(retry_count)
        else:
            print("Maximum retries reached. Exiting.")
            exit(1)

print("Transforming data...")

# Get the transformation timestamp
transform_timestamp = datetime.datetime.now()
print(f"Transform timestamp: {transform_timestamp}")

# Create a new table with transformed data
# This query will:
# 1. Extract date and project information
# 2. Calculate total hours per project per day
# 3. Reshape the data into an analysis-friendly form
con.execute("""
    DROP TABLE IF EXISTS working_times_summary;
    CREATE TABLE working_times_summary AS
    SELECT
        Datum AS date,
        Projektname AS project_name,
        "Leistungsart (Bezeichnung)" AS activity_type,
        SUM("Zeit [h]") AS total_hours,
        SUM("Zeit [h]" / 8) AS total_days,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY date, project_name, activity_type
    ORDER BY date, project_name, activity_type;
""".format(transform_timestamp=transform_timestamp))

# Create a table with project totals
con.execute("""
    DROP TABLE IF EXISTS project_summary;
    CREATE TABLE project_summary AS
    SELECT
        Projektname AS project_name,
        SUM("Zeit [h]") AS total_hours,
        SUM("Zeit [h]" / 8) AS total_days,
        COUNT(DISTINCT Datum) AS days_worked,
        MAX(import_timestamp) AS source_import_timestamp,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY project_name
    ORDER BY total_hours DESC;
""".format(transform_timestamp=transform_timestamp))

# Create a table with daily totals
con.execute("""
    DROP TABLE IF EXISTS daily_summary;
    CREATE TABLE daily_summary AS
    SELECT
        Datum AS date,
        SUM("Zeit [h]") AS total_hours,
        COUNT(*) AS entry_count,
        COUNT(DISTINCT Projektname) AS project_count,
        MAX(import_timestamp) AS source_import_timestamp,
        '{transform_timestamp}' AS transform_timestamp
    FROM working_times
    GROUP BY date
    ORDER BY date;
""".format(transform_timestamp=transform_timestamp))

# Verify the data was transformed
summary_count = con.execute("SELECT COUNT(*) FROM working_times_summary").fetchone()[0]
project_count = con.execute("SELECT COUNT(*) FROM project_summary").fetchone()[0]
daily_count = con.execute("SELECT COUNT(*) FROM daily_summary").fetchone()[0]

print(f"Successfully created {summary_count} records in working_times_summary table.")
print(f"Successfully created {project_count} records in project_summary table.")
print(f"Successfully created {daily_count} records in daily_summary table.")

# Print a sample of the summary table
print("\nSample of working_times_summary table:")
summary_sample = con.execute("SELECT date, project_name, activity_type, total_hours, total_days, transform_timestamp FROM working_times_summary LIMIT 5").fetchall()
for row in summary_sample:
    print(row)

# Print a sample of the project summary table
print("\nProject summary (top 5 by hours):")
project_sample = con.execute("SELECT project_name, total_hours, total_days, days_worked, transform_timestamp FROM project_summary LIMIT 5").fetchall()
for row in project_sample:
    print(row)

# Total hours worked
total_hours = con.execute("SELECT SUM(total_hours) FROM project_summary").fetchone()[0]
total_days = con.execute("SELECT SUM(total_days) FROM project_summary").fetchone()[0]
print(f"\nTotal hours worked: {total_hours:.2f}")
print(f"Total days worked: {total_days:.2f}")

# Close the connection
con.close()

print("\nData transformation complete.")
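Since all three summary tables are derived from the same `working_times` rows, a quick consistency check can confirm the transform dropped nothing. A minimal sketch, assuming the column names created above:

```python
import duckdb

con = duckdb.connect('working_times.db', read_only=True)
raw_hours = con.execute('SELECT SUM("Zeit [h]") FROM working_times').fetchone()[0]
agg_hours = con.execute('SELECT SUM(total_hours) FROM working_times_summary').fetchone()[0]
con.close()

# Both sums cover the same rows, so they should agree up to float rounding
assert abs(raw_hours - agg_hours) < 1e-6, f"raw {raw_hours} != aggregated {agg_hours}"
print(f"OK: {raw_hours:.2f}h in both raw and summary tables")
```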
working_times.db (new binary file)
Binary file not shown.