delete unused scripts
This commit is contained in:
parent
828e0a5c4b
commit
1671e169f8
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,3 +1,5 @@
|
||||
./venv
|
||||
data/*
|
||||
!data/.gitkeep
|
||||
|
||||
working_times.db
|
||||
42
README.md
42
README.md
@ -2,9 +2,6 @@
|
||||
|
||||
This project analyzes working time data from a CSV file by:
|
||||
1. Importing the data into a DuckDB database
|
||||
2. Transforming the data for analysis
|
||||
3. Generating reports based on the data
|
||||
|
||||
## Setup
|
||||
|
||||
### Dependencies
|
||||
@ -29,42 +26,3 @@ This will:
|
||||
- Create a DuckDB database file `working_times.db`
|
||||
- Import the CSV data into a table
|
||||
- Add an import timestamp to each record
|
||||
|
||||
### 2. Transform Data
|
||||
|
||||
Run the transformation script to create analytical views:
|
||||
|
||||
```bash
|
||||
python3 transform_data.py
|
||||
```
|
||||
|
||||
This will:
|
||||
- Create summary tables with aggregated data
|
||||
- Convert hours to days (using 8 hours = 1 day conversion)
|
||||
- Add transformation timestamps
|
||||
|
||||
### 3. Analyze Data
|
||||
|
||||
Run the analysis script to generate reports:
|
||||
|
||||
```bash
|
||||
python3 analyze_data.py
|
||||
```
|
||||
|
||||
This will produce:
|
||||
- Overall time summary
|
||||
- Top projects by hours
|
||||
- Busiest days
|
||||
- Day distribution analysis
|
||||
- Project-activity combinations
|
||||
|
||||
## Data Structure
|
||||
|
||||
The analysis uses the following tables:
|
||||
|
||||
- `working_times`: Raw imported data
|
||||
- `working_times_summary`: Per-day, per-project aggregation
|
||||
- `project_summary`: Total time per project
|
||||
- `daily_summary`: Total time per day
|
||||
|
||||
Each derived table includes timestamps for data lineage tracking.
|
||||
157
analyze_data.py
157
analyze_data.py
@ -1,157 +0,0 @@
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
|
||||
# Connect to the database
|
||||
try:
|
||||
con = duckdb.connect('working_times.db')
|
||||
print("Connected to working_times.db")
|
||||
except Exception as e:
|
||||
print(f"Error connecting to database: {e}")
|
||||
exit(1)
|
||||
|
||||
# Get the current analysis timestamp
|
||||
analysis_timestamp = datetime.now()
|
||||
|
||||
# Function to format hours
|
||||
def format_hours(hours):
|
||||
return f"{hours:.2f}h"
|
||||
|
||||
# Function to format days
|
||||
def format_days(days):
|
||||
return f"{days:.2f}d"
|
||||
|
||||
# Get the date range of the data
|
||||
date_range = con.execute("""
|
||||
SELECT MIN(date) AS start_date, MAX(date) AS end_date
|
||||
FROM daily_summary
|
||||
""").fetchone()
|
||||
|
||||
start_date = date_range[0]
|
||||
end_date = date_range[1]
|
||||
|
||||
# Get the transformation timestamp (most recent)
|
||||
transform_info = con.execute("""
|
||||
SELECT
|
||||
MAX(transform_timestamp) AS transform_timestamp,
|
||||
MAX(source_import_timestamp) AS source_import_timestamp
|
||||
FROM daily_summary
|
||||
""").fetchone()
|
||||
|
||||
transform_timestamp = transform_info[0]
|
||||
source_import_timestamp = transform_info[1]
|
||||
|
||||
print("\n" + "="*60)
|
||||
print(f"WORKING TIME ANALYSIS: {start_date} to {end_date}")
|
||||
print(f"ANALYSIS TIMESTAMP: {analysis_timestamp}")
|
||||
print(f"DATA TRANSFORMATION: {transform_timestamp}")
|
||||
print(f"DATA IMPORT: {source_import_timestamp}")
|
||||
print("="*60)
|
||||
|
||||
# Get the total hours and days worked
|
||||
totals = con.execute("""
|
||||
SELECT SUM(total_hours) AS total_hours, SUM(total_days) AS total_days
|
||||
FROM project_summary
|
||||
""").fetchone()
|
||||
|
||||
total_hours = totals[0]
|
||||
total_days = totals[1]
|
||||
num_working_days = con.execute("SELECT COUNT(*) FROM daily_summary WHERE total_hours > 0").fetchone()[0]
|
||||
avg_hours_per_day = total_hours / num_working_days if num_working_days > 0 else 0
|
||||
|
||||
print(f"\nTOTAL HOURS: {format_hours(total_hours)}")
|
||||
print(f"TOTAL DAYS: {format_days(total_days)}")
|
||||
print(f"WORKING DAYS: {num_working_days}")
|
||||
print(f"AVG HOURS PER WORKING DAY: {format_hours(avg_hours_per_day)}")
|
||||
|
||||
# Get the top projects by hours
|
||||
top_projects = con.execute("""
|
||||
SELECT project_name, total_hours, total_days, days_worked
|
||||
FROM project_summary
|
||||
ORDER BY total_hours DESC
|
||||
LIMIT 5
|
||||
""").fetchall()
|
||||
|
||||
print("\n" + "-"*60)
|
||||
print("TOP 5 PROJECTS BY HOURS")
|
||||
print("-"*60)
|
||||
for i, (project, hours, days, worked_days) in enumerate(top_projects, 1):
|
||||
percent = (hours / total_hours) * 100
|
||||
print(f"{i}. {project}")
|
||||
print(f" {format_hours(hours)} ({percent:.1f}% of total) / {format_days(days)}")
|
||||
print(f" Across {worked_days} days, daily average: {format_hours(hours/worked_days) if worked_days > 0 else 0}")
|
||||
|
||||
# Get the busiest days
|
||||
busiest_days = con.execute("""
|
||||
SELECT date, total_hours, project_count
|
||||
FROM daily_summary
|
||||
WHERE total_hours > 0
|
||||
ORDER BY total_hours DESC
|
||||
LIMIT 5
|
||||
""").fetchall()
|
||||
|
||||
print("\n" + "-"*60)
|
||||
print("TOP 5 BUSIEST DAYS")
|
||||
print("-"*60)
|
||||
for i, (date, hours, project_count) in enumerate(busiest_days, 1):
|
||||
# Calculate day equivalent
|
||||
day_equivalent = hours / 8
|
||||
print(f"{i}. {date}: {format_hours(hours)} ({format_days(day_equivalent)}) across {project_count} projects")
|
||||
|
||||
# Get day distribution
|
||||
day_distribution = con.execute("""
|
||||
SELECT
|
||||
CASE
|
||||
WHEN total_hours <= 4 THEN '0-4 hours'
|
||||
WHEN total_hours <= 6 THEN '4-6 hours'
|
||||
WHEN total_hours <= 8 THEN '6-8 hours'
|
||||
WHEN total_hours <= 10 THEN '8-10 hours'
|
||||
ELSE '10+ hours'
|
||||
END AS hour_range,
|
||||
COUNT(*) as day_count
|
||||
FROM daily_summary
|
||||
WHERE total_hours > 0
|
||||
GROUP BY hour_range
|
||||
ORDER BY
|
||||
CASE
|
||||
WHEN hour_range = '0-4 hours' THEN 1
|
||||
WHEN hour_range = '4-6 hours' THEN 2
|
||||
WHEN hour_range = '6-8 hours' THEN 3
|
||||
WHEN hour_range = '8-10 hours' THEN 4
|
||||
ELSE 5
|
||||
END
|
||||
""").fetchall()
|
||||
|
||||
print("\n" + "-"*60)
|
||||
print("DAY DISTRIBUTION")
|
||||
print("-"*60)
|
||||
for hour_range, day_count in day_distribution:
|
||||
percent = (day_count / num_working_days) * 100
|
||||
print(f"{hour_range}: {day_count} days ({percent:.1f}%)")
|
||||
|
||||
# Print an overview of project/activity combinations
|
||||
project_activity_combo = con.execute("""
|
||||
SELECT
|
||||
project_name,
|
||||
activity_type,
|
||||
SUM(total_hours) as hours,
|
||||
SUM(total_days) as days
|
||||
FROM working_times_summary
|
||||
GROUP BY project_name, activity_type
|
||||
ORDER BY hours DESC
|
||||
LIMIT 10
|
||||
""").fetchall()
|
||||
|
||||
print("\n" + "-"*60)
|
||||
print("TOP 10 PROJECT-ACTIVITY COMBINATIONS")
|
||||
print("-"*60)
|
||||
for project, activity, hours, days in project_activity_combo:
|
||||
percent = (hours / total_hours) * 100
|
||||
print(f"{project} - {activity}: {format_hours(hours)} ({format_days(days)}, {percent:.1f}%)")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print(f"END OF ANALYSIS - Generated at {analysis_timestamp}")
|
||||
print("="*60)
|
||||
|
||||
# Close the connection
|
||||
con.close()
|
||||
@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
"""
Working Time Analysis - Complete Workflow

Runs the three stages of the analysis back to back:
1. Import the data
2. Transform the data
3. Generate analysis reports
"""

import os
import subprocess
import sys
import time


def run_step(script_name, step_desc):
    """Execute one pipeline script; return True on success, False on failure."""
    print(f"\n{'='*60}")
    print(f"STEP: {step_desc}")
    print(f"{'='*60}")

    try:
        # Run the stage with the same interpreter and capture its output.
        completed = subprocess.run(
            [sys.executable, script_name],
            capture_output=True,
            text=True,
            check=True
        )
    except subprocess.CalledProcessError as err:
        print(f"ERROR in {script_name}:")
        print(err.stderr)
        return False

    print(completed.stdout)
    return True


def main():
    """Drive the import -> transform -> analyze pipeline; return overall success."""
    # The pipeline needs the source CSV before anything else can run.
    csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv'
    if not os.path.exists(csv_file):
        print(f"Error: CSV file not found at {csv_file}")
        return False

    # Each stage script with its banner text, executed in order. The short
    # sleep between stages lets any database file locks clear.
    stages = [
        ('import_data.py', 'IMPORTING DATA'),
        ('transform_data.py', 'TRANSFORMING DATA'),
        ('analyze_data.py', 'ANALYZING DATA'),
    ]
    for index, (script, banner) in enumerate(stages):
        if index:
            time.sleep(1)
        if not run_step(script, banner):
            return False

    print("\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)
    return True


if __name__ == "__main__":
    sys.exit(0 if main() else 1)
|
||||
@ -1,118 +0,0 @@
|
||||
import duckdb
import pandas as pd
import os
import time
import datetime
from contextlib import closing


def _connect_with_retry(db_path, max_retries=5):
    """Open the DuckDB database read/write, retrying on lock conflicts.

    Waits ``attempt`` seconds between tries (growing back-off); exits the
    process with status 1 if every attempt fails.
    """
    print(f"Trying to connect to {db_path}...")
    for attempt in range(1, max_retries + 1):
        try:
            # NOTE: this opens the database read/write -- the original
            # comment claimed read-only access, which the code never used.
            con = duckdb.connect(db_path)
            print(f"Connected to {db_path}")
            return con
        except Exception as e:
            print(f"Connection attempt {attempt} failed: {e}")
            if attempt < max_retries:
                print(f"Retrying in {attempt} seconds...")
                time.sleep(attempt)
    print("Maximum retries reached. Exiting.")
    exit(1)


def _create_summary_tables(con, transform_timestamp):
    """(Re)build the three derived tables, stamping each row with the timestamp.

    The timestamp is interpolated as a SQL literal via str.format: these are
    multi-statement CREATE TABLE scripts, where parameter binding is not
    available. str() of a datetime contains no quote characters, so the
    interpolation cannot break out of the literal.
    """
    # Per-day, per-project, per-activity aggregation (8 hours = 1 day).
    con.execute("""
        DROP TABLE IF EXISTS working_times_summary;
        CREATE TABLE working_times_summary AS
        SELECT
            Datum AS date,
            Projektname AS project_name,
            "Leistungsart (Bezeichnung)" AS activity_type,
            SUM("Zeit [h]") AS total_hours,
            SUM("Zeit [h]"/8) AS total_days,
            '{transform_timestamp}' AS transform_timestamp
        FROM working_times
        GROUP BY date, project_name, activity_type
        ORDER BY date, project_name, activity_type;
    """.format(transform_timestamp=transform_timestamp))

    # Totals per project.
    con.execute("""
        DROP TABLE IF EXISTS project_summary;
        CREATE TABLE project_summary AS
        SELECT
            Projektname AS project_name,
            SUM("Zeit [h]") AS total_hours,
            SUM("Zeit [h]"/8) AS total_days,
            COUNT(DISTINCT Datum) AS days_worked,
            MAX(import_timestamp) AS source_import_timestamp,
            '{transform_timestamp}' AS transform_timestamp
        FROM working_times
        GROUP BY project_name
        ORDER BY total_hours DESC;
    """.format(transform_timestamp=transform_timestamp))

    # Totals per day.
    con.execute("""
        DROP TABLE IF EXISTS daily_summary;
        CREATE TABLE daily_summary AS
        SELECT
            Datum AS date,
            SUM("Zeit [h]") AS total_hours,
            COUNT(*) AS entry_count,
            COUNT(DISTINCT Projektname) AS project_count,
            MAX(import_timestamp) AS source_import_timestamp,
            '{transform_timestamp}' AS transform_timestamp
        FROM working_times
        GROUP BY date
        ORDER BY date;
    """.format(transform_timestamp=transform_timestamp))


def _report(con):
    """Print row counts, samples, and grand totals for the derived tables."""
    summary_count = con.execute("SELECT COUNT(*) FROM working_times_summary").fetchone()[0]
    project_count = con.execute("SELECT COUNT(*) FROM project_summary").fetchone()[0]
    daily_count = con.execute("SELECT COUNT(*) FROM daily_summary").fetchone()[0]

    print(f"Successfully created {summary_count} records in working_times_summary table.")
    print(f"Successfully created {project_count} records in project_summary table.")
    print(f"Successfully created {daily_count} records in daily_summary table.")

    print("\nSample of working_times_summary table:")
    summary_sample = con.execute("SELECT date, project_name, activity_type, total_hours, total_days, transform_timestamp FROM working_times_summary LIMIT 5").fetchall()
    for row in summary_sample:
        print(row)

    print("\nProject summary (top 5 by hours):")
    project_sample = con.execute("SELECT project_name, total_hours, total_days, days_worked, transform_timestamp FROM project_summary LIMIT 5").fetchall()
    for row in project_sample:
        print(row)

    total_hours = con.execute("SELECT SUM(total_hours) FROM project_summary").fetchone()[0]
    total_days = con.execute("SELECT SUM(total_days) FROM project_summary").fetchone()[0]
    print(f"\nTotal hours worked: {total_hours:.2f}")
    print(f"Total days worked: {total_days:.2f}")


def main():
    """Transform the imported working_times data into summary tables."""
    con = _connect_with_retry('working_times.db')

    # BUG FIX: closing() guarantees the connection is released even when a
    # query fails; the original leaked it on any error after connect.
    with closing(con):
        print("Transforming data...")

        transform_timestamp = datetime.datetime.now()
        print(f"Transform timestamp: {transform_timestamp}")

        _create_summary_tables(con, transform_timestamp)
        _report(con)

    print("\nData transformation complete.")


if __name__ == "__main__":
    main()
|
||||
Loading…
x
Reference in New Issue
Block a user