EmployeeDB/source/Confluence/json_to_df.py

131 lines
5.4 KiB
Python

import json
import os
import pandas as pd
# File stems of the CSV tables, in the order they are written.
_TABLE_NAMES = (
    "global_info",
    "languages",
    "main_skills",
    "skills",
    "projects",
    "project_tasks",
    "emphasis",
    "publications",
    "certificates",
)


def _main_skills_table(main_skills) -> pd.DataFrame:
    """Turn the numbered ``mainSkill<N>Title``/``mainSkill<N>Quality`` pairs into rows.

    Slots 1 to 3 are looked up; a slot whose title or quality column is
    missing is skipped. The result has the columns ``skillName`` and
    ``skillLevel`` (or no columns at all when no slot is filled).
    """
    wide = pd.json_normalize(main_skills)
    rows = []
    for number in (1, 2, 3):
        title = f"mainSkill{number}Title"
        quality = f"mainSkill{number}Quality"
        try:
            pair = wide[[title, quality]]
        except KeyError:
            # This slot is not (completely) present for the person.
            continue
        rows.append(pair.rename({title: "skillName", quality: "skillLevel"}, axis=1))
    return pd.concat(rows) if rows else pd.DataFrame()


def _exploded_table(section, key_column: str) -> pd.DataFrame:
    """Return one row per (key, item) pair of *section*.

    *section* is presumably a ``{key: [items, ...]}`` mapping (TODO confirm
    against the JSON schema). The keys end up in *key_column*, the items in
    a column named ``skill``.
    """
    wide = pd.json_normalize(section)
    # ``wide.items()`` yields (column name, Series) pairs. The first explode
    # unpacks each Series, the second one unpacks the list stored inside it.
    return (
        pd.DataFrame(wide.items(), columns=[key_column, "skill"])
        .explode("skill")
        .explode("skill")
    )


def _project_tasks_table(projects) -> pd.DataFrame:
    """Return one row per project task with the columns ``task`` and ``title``."""
    if not projects:
        # Nothing to extract for a person without projects; keep the same
        # column layout as the non-empty case.
        return pd.DataFrame(columns=["task", "title"])
    tasks = pd.json_normalize(projects, record_path=["tasks"], meta=["title"])
    # Plain (non-dict) task records land in a column labelled 0.
    return tasks.rename({0: "task"}, axis=1)


def _profile_tables(data: dict, short: str) -> dict:
    """Split one parsed profile into its per-topic frames.

    Returns a dict keyed by the names in ``_TABLE_NAMES``. Every frame gets a
    ``short`` column set to *short*. Sections missing from the profile
    produce empty frames instead of raising.
    """
    # A profile without a "de" section is treated like an empty one.
    localized = data.get("de") or {}
    projects = localized.get("projects", [])

    global_info = pd.json_normalize(data.get("global", []))
    global_info["graduation"] = localized.get("graduation")
    global_info["career"] = localized.get("career")
    # The year of birth is deliberately not exported.
    global_info = global_info.drop(columns=["yearOfBirth"], errors="ignore")

    tables = {
        "global_info": global_info,
        "languages": pd.DataFrame(
            localized.get("languages", []), columns=["language"]
        ),
        "main_skills": _main_skills_table(localized.get("mainSkills", [])),
        "skills": _exploded_table(localized.get("skills", []), "skillType"),
        # Tasks are exported in their own table, "favorite" is not exported.
        "projects": pd.json_normalize(projects).drop(
            columns=["favorite", "tasks"], errors="ignore"
        ),
        "project_tasks": _project_tasks_table(projects),
        "emphasis": _exploded_table(localized.get("emphasis", []), "emphasis"),
        "publications": pd.json_normalize(localized.get("publications", [])),
        "certificates": pd.json_normalize(localized.get("certificates", [])),
    }
    for table in tables.values():
        table["short"] = short
    return tables


def convert_json_to_dataframe(
    directory: str = os.path.join("source", "Data"),
    output_directory: str = os.path.join("source", "Tables"),
) -> None:
    """Flatten all JSON profiles in *directory* into one CSV file per topic.

    Every ``<short>.json`` file directly inside *directory* is parsed and
    split into nine frames (global info, languages, main skills, skills,
    projects, project tasks, emphasis, publications, certificates). Each row
    is tagged with a ``short`` column holding the file name without the
    ``.json`` suffix. The frames of all files are concatenated per topic and
    written to ``<output_directory>/<topic>.csv``; all nine files are written
    even when no profile was found.

    Args:
        directory: Folder scanned (non-recursively) for ``*.json`` files.
        output_directory: Folder receiving the CSV files; created if missing.

    Raises:
        OSError: If *directory* cannot be scanned or a file cannot be
            read or written.
        json.JSONDecodeError: If a profile is not valid JSON.
    """
    collected = {name: [] for name in _TABLE_NAMES}

    with os.scandir(directory) as entries:
        for entry in entries:
            if not (entry.is_file() and entry.name.endswith(".json")):
                continue
            # Read as UTF-8 explicitly so the result does not depend on the
            # platform's default encoding.
            with open(entry.path, "r", encoding="utf-8") as f:
                data = json.load(f)
            short = entry.name.removesuffix(".json")
            for name, table in _profile_tables(data, short).items():
                collected[name].append(table)

    os.makedirs(output_directory, exist_ok=True)
    for name in _TABLE_NAMES:
        frames = collected[name]
        # Concatenate once per table (instead of once per file) with the most
        # recently read profile first.
        combined = pd.concat(frames[::-1]) if frames else pd.DataFrame()
        combined.to_csv(os.path.join(output_directory, f"{name}.csv"))