From 1671e169f834a9c1f3450c0b11075cae7acefea2 Mon Sep 17 00:00:00 2001 From: lasse Date: Fri, 18 Apr 2025 22:41:42 +0200 Subject: [PATCH] delete unsed scripts --- .gitignore | 4 +- README.md | 42 ------------- analyze_data.py | 157 ---------------------------------------------- run_analysis.py | 68 -------------------- transform_data.py | 118 ---------------------------------- 5 files changed, 3 insertions(+), 386 deletions(-) delete mode 100644 analyze_data.py delete mode 100644 run_analysis.py delete mode 100644 transform_data.py diff --git a/.gitignore b/.gitignore index d747506..c34aeb4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ ./venv data/* -!data/.gitkeep \ No newline at end of file +!data/.gitkeep + +working_times.db \ No newline at end of file diff --git a/README.md b/README.md index e15bae2..3e06b78 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,6 @@ This project analyzes working time data from a CSV file by: 1. Importing the data into a DuckDB database -2. Transforming the data for analysis -3. Generating reports based on the data - ## Setup ### Dependencies @@ -29,42 +26,3 @@ This will: - Create a DuckDB database file `working_times.db` - Import the CSV data into a table - Add an import timestamp to each record - -### 2. Transform Data - -Run the transformation script to create analytical views: - -```bash -python3 transform_data.py -``` - -This will: -- Create summary tables with aggregated data -- Convert hours to days (using 8 hours = 1 day conversion) -- Add transformation timestamps - -### 3. Analyze Data - -Run the analysis script to generate reports: - -```bash -python3 analyze_data.py -``` - -This will produce: -- Overall time summary -- Top projects by hours -- Busiest days -- Day distribution analysis -- Project-activity combinations - -## Data Structure - -The analysis uses the following tables: - -- `working_times`: Raw imported data -- `working_times_summary`: Per-day, per-project aggregation -- `project_summary`: Total time per project -- `daily_summary`: Total time per day - -Each derived table includes timestamps for data lineage tracking. \ No newline at end of file diff --git a/analyze_data.py b/analyze_data.py deleted file mode 100644 index 5bed412..0000000 --- a/analyze_data.py +++ /dev/null @@ -1,157 +0,0 @@ -import duckdb -import pandas as pd -from datetime import datetime - -# Connect to the database -try: - con = duckdb.connect('working_times.db') - print("Connected to working_times.db") -except Exception as e: - print(f"Error connecting to database: {e}") - exit(1) - -# Get the current analysis timestamp -analysis_timestamp = datetime.now() - -# Function to format hours -def format_hours(hours): - return f"{hours:.2f}h" - -# Function to format days -def format_days(days): - return f"{days:.2f}d" - -# Get the date range of the data -date_range = con.execute(""" - SELECT MIN(date) AS start_date, MAX(date) AS end_date - FROM daily_summary -""").fetchone() - -start_date = date_range[0] -end_date = date_range[1] - -# Get the transformation timestamp (most recent) -transform_info = con.execute(""" - SELECT - MAX(transform_timestamp) AS transform_timestamp, - MAX(source_import_timestamp) AS source_import_timestamp - FROM daily_summary -""").fetchone() - -transform_timestamp = transform_info[0] -source_import_timestamp = transform_info[1] - -print("\n" + "="*60) -print(f"WORKING TIME ANALYSIS: {start_date} to {end_date}") -print(f"ANALYSIS TIMESTAMP: {analysis_timestamp}") -print(f"DATA TRANSFORMATION: {transform_timestamp}") -print(f"DATA IMPORT: {source_import_timestamp}") -print("="*60) - -# Get the total hours and days worked -totals = con.execute(""" - SELECT SUM(total_hours) AS total_hours, SUM(total_days) AS total_days - FROM project_summary -""").fetchone() - -total_hours = totals[0] -total_days = totals[1] -num_working_days = con.execute("SELECT COUNT(*) FROM daily_summary WHERE total_hours > 0").fetchone()[0] -avg_hours_per_day = total_hours / num_working_days if num_working_days > 0 else 0 - -print(f"\nTOTAL HOURS: {format_hours(total_hours)}") -print(f"TOTAL DAYS: {format_days(total_days)}") -print(f"WORKING DAYS: {num_working_days}") -print(f"AVG HOURS PER WORKING DAY: {format_hours(avg_hours_per_day)}") - -# Get the top projects by hours -top_projects = con.execute(""" - SELECT project_name, total_hours, total_days, days_worked - FROM project_summary - ORDER BY total_hours DESC - LIMIT 5 -""").fetchall() - -print("\n" + "-"*60) -print("TOP 5 PROJECTS BY HOURS") -print("-"*60) -for i, (project, hours, days, worked_days) in enumerate(top_projects, 1): - percent = (hours / total_hours) * 100 - print(f"{i}. {project}") - print(f" {format_hours(hours)} ({percent:.1f}% of total) / {format_days(days)}") - print(f" Across {worked_days} days, daily average: {format_hours(hours/worked_days) if worked_days > 0 else 0}") - -# Get the busiest days -busiest_days = con.execute(""" - SELECT date, total_hours, project_count - FROM daily_summary - WHERE total_hours > 0 - ORDER BY total_hours DESC - LIMIT 5 -""").fetchall() - -print("\n" + "-"*60) -print("TOP 5 BUSIEST DAYS") -print("-"*60) -for i, (date, hours, project_count) in enumerate(busiest_days, 1): - # Calculate day equivalent - day_equivalent = hours / 8 - print(f"{i}. {date}: {format_hours(hours)} ({format_days(day_equivalent)}) across {project_count} projects") - -# Get day distribution -day_distribution = con.execute(""" - SELECT - CASE - WHEN total_hours <= 4 THEN '0-4 hours' - WHEN total_hours <= 6 THEN '4-6 hours' - WHEN total_hours <= 8 THEN '6-8 hours' - WHEN total_hours <= 10 THEN '8-10 hours' - ELSE '10+ hours' - END AS hour_range, - COUNT(*) as day_count - FROM daily_summary - WHERE total_hours > 0 - GROUP BY hour_range - ORDER BY - CASE - WHEN hour_range = '0-4 hours' THEN 1 - WHEN hour_range = '4-6 hours' THEN 2 - WHEN hour_range = '6-8 hours' THEN 3 - WHEN hour_range = '8-10 hours' THEN 4 - ELSE 5 - END -""").fetchall() - -print("\n" + "-"*60) -print("DAY DISTRIBUTION") -print("-"*60) -for hour_range, day_count in day_distribution: - percent = (day_count / num_working_days) * 100 - print(f"{hour_range}: {day_count} days ({percent:.1f}%)") - -# Print an overview of project/activity combinations -project_activity_combo = con.execute(""" - SELECT - project_name, - activity_type, - SUM(total_hours) as hours, - SUM(total_days) as days - FROM working_times_summary - GROUP BY project_name, activity_type - ORDER BY hours DESC - LIMIT 10 -""").fetchall() - -print("\n" + "-"*60) -print("TOP 10 PROJECT-ACTIVITY COMBINATIONS") -print("-"*60) -for project, activity, hours, days in project_activity_combo: - percent = (hours / total_hours) * 100 - print(f"{project} - {activity}: {format_hours(hours)} ({format_days(days)}, {percent:.1f}%)") - -print("\n" + "="*60) -print(f"END OF ANALYSIS - Generated at {analysis_timestamp}") -print("="*60) - -# Close the connection -con.close() \ No newline at end of file diff --git a/run_analysis.py b/run_analysis.py deleted file mode 100644 index f0a9865..0000000 --- a/run_analysis.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -""" -Working Time Analysis - Complete Workflow -This script runs all three steps of the analysis in sequence: -1. Import the data -2. Transform the data -3. Generate analysis reports -""" - -import os -import sys -import subprocess -import time - -def run_step(script_name, step_desc): - """Run a step in the analysis and handle errors""" - print(f"\n{'='*60}") - print(f"STEP: {step_desc}") - print(f"{'='*60}") - - try: - # Run the script and capture output - result = subprocess.run( - [sys.executable, script_name], - capture_output=True, - text=True, - check=True - ) - print(result.stdout) - return True - except subprocess.CalledProcessError as e: - print(f"ERROR in {script_name}:") - print(e.stderr) - return False - -def main(): - # Check if the CSV file exists - csv_file = 'data/lawi-2025-04-01-2025-04-30-2025-04-17.csv' - if not os.path.exists(csv_file): - print(f"Error: CSV file not found at {csv_file}") - return False - - # Step 1: Import Data - if not run_step('import_data.py', 'IMPORTING DATA'): - return False - - # Wait a moment to ensure any file locks are released - time.sleep(1) - - # Step 2: Transform Data - if not run_step('transform_data.py', 'TRANSFORMING DATA'): - return False - - # Wait a moment to ensure any file locks are released - time.sleep(1) - - # Step 3: Analyze Data - if not run_step('analyze_data.py', 'ANALYZING DATA'): - return False - - print("\n" + "="*60) - print("ANALYSIS COMPLETE") - print("="*60) - return True - -if __name__ == "__main__": - success = main() - sys.exit(0 if success else 1) \ No newline at end of file diff --git a/transform_data.py b/transform_data.py deleted file mode 100644 index 7883204..0000000 --- a/transform_data.py +++ /dev/null @@ -1,118 +0,0 @@ -import duckdb -import pandas as pd -import os -import time -import datetime - -# Try to connect to the database with retry logic -max_retries = 5 -retry_count = 0 -connected = False - -print("Trying to connect to working_times.db...") - -while not connected and retry_count < max_retries: - try: - # Try to connect with 'access_mode=read_only' to avoid lock conflicts - con = duckdb.connect('working_times.db') - connected = True - print("Connected to working_times.db") - except Exception as e: - retry_count += 1 - print(f"Connection attempt {retry_count} failed: {e}") - if retry_count < max_retries: - print(f"Retrying in {retry_count} seconds...") - time.sleep(retry_count) - else: - print("Maximum retries reached. Exiting.") - exit(1) - -print("Transforming data...") - -# Get the transformation timestamp -transform_timestamp = datetime.datetime.now() -print(f"Transform timestamp: {transform_timestamp}") - -# Create a new table with transformed data -# This query will: -# 1. Extract date and project information -# 2. Calculate total hours per project per day -# 3. Format the data in a more analytical friendly way -con.execute(""" - DROP TABLE IF EXISTS working_times_summary; - CREATE TABLE working_times_summary AS - SELECT - Datum AS date, - Projektname AS project_name, - "Leistungsart (Bezeichnung)" AS activity_type, - SUM("Zeit [h]") AS total_hours, - SUM("Zeit [h]"/8) AS total_days, - '{transform_timestamp}' AS transform_timestamp - FROM working_times - GROUP BY date, project_name, activity_type - ORDER BY date, project_name, activity_type; -""".format(transform_timestamp=transform_timestamp)) - -# Create a table with project totals -con.execute(""" - DROP TABLE IF EXISTS project_summary; - CREATE TABLE project_summary AS - SELECT - Projektname AS project_name, - SUM("Zeit [h]") AS total_hours, - SUM("Zeit [h]"/8) AS total_days, - COUNT(DISTINCT Datum) AS days_worked, - MAX(import_timestamp) AS source_import_timestamp, - '{transform_timestamp}' AS transform_timestamp - FROM working_times - GROUP BY project_name - ORDER BY total_hours DESC; -""".format(transform_timestamp=transform_timestamp)) - -# Create a table with daily totals -con.execute(""" - DROP TABLE IF EXISTS daily_summary; - CREATE TABLE daily_summary AS - SELECT - Datum AS date, - SUM("Zeit [h]") AS total_hours, - COUNT(*) AS entry_count, - COUNT(DISTINCT Projektname) AS project_count, - MAX(import_timestamp) AS source_import_timestamp, - '{transform_timestamp}' AS transform_timestamp - FROM working_times - GROUP BY date - ORDER BY date; -""".format(transform_timestamp=transform_timestamp)) - -# Verify the data was transformed -summary_count = con.execute("SELECT COUNT(*) FROM working_times_summary").fetchone()[0] -project_count = con.execute("SELECT COUNT(*) FROM project_summary").fetchone()[0] -daily_count = con.execute("SELECT COUNT(*) FROM daily_summary").fetchone()[0] - -print(f"Successfully created {summary_count} records in working_times_summary table.") -print(f"Successfully created {project_count} records in project_summary table.") -print(f"Successfully created {daily_count} records in daily_summary table.") - -# Print a sample of the summary table -print("\nSample of working_times_summary table:") -summary_sample = con.execute("SELECT date, project_name, activity_type, total_hours, total_days, transform_timestamp FROM working_times_summary LIMIT 5").fetchall() -for row in summary_sample: - print(row) - -# Print a sample of the project summary table -print("\nProject summary (top 5 by hours):") -project_sample = con.execute("SELECT project_name, total_hours, total_days, days_worked, transform_timestamp FROM project_summary LIMIT 5").fetchall() -for row in project_sample: - print(row) - -# Total hours worked -total_hours = con.execute("SELECT SUM(total_hours) FROM project_summary").fetchone()[0] -total_days = con.execute("SELECT SUM(total_days) FROM project_summary").fetchone()[0] -print(f"\nTotal hours worked: {total_hours:.2f}") -print(f"Total days worked: {total_days:.2f}") - -# Close the connection -con.close() - -print("\nData transformation complete.") \ No newline at end of file