Upload files to "source/Confluence"

2025-04-01 08:46:03 +00:00
parent 68f7bd31ad
commit fe28536b7b
3 changed files with 594 additions and 0 deletions
--- a/source/Confluence/Confluence.py
+++ b/source/Confluence/Confluence.py
@@ -0,0 +1,355 @@
+import getpass
+import json
+import re
+
+from selenium import webdriver
+from selenium.common.exceptions import WebDriverException
+
+#gets all basic information on the person which is synchronized with persis
+#saves them in a json format
+def get_persis_info(driver, name):
+    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)
+    #driver.get("file:///C:/Users/em/Downloads/Mrozek,%20Emily%20(em)%20-%20Mitarbeiter-Masterprofile%20-%20Confluence.html")
+
+    general = driver.find_element("id", "general-de")
+    table = general.find_element("tag name", "tbody")
+
+    job_description = general.find_element("xpath", "./p/span").text 
+    
+    role = general.find_element("xpath", "./p/i").text
+
+    email = table.find_element("xpath", "./tr[1]/td[2]/a/h4").text
+    
+    phone_number = table.find_element("xpath", "./tr[2]/td[2]/a/h4").text
+
+    location = table.find_element("xpath", "./tr[4]/td[2]/h4").text
+
+    business_area = table.find_element("xpath", "./tr[5]/td[2]/h4").text
+
+    unit = table.find_element("xpath", "./tr[6]/td[2]/h4").text
+
+    team = table.find_element("xpath", "./tr[7]/td[2]/h4").text
+
+    supervisor = table.find_element("xpath", "./tr[8]/td[2]/a/h4").text
+    
+    persis_info = {
+        "job_description": job_description,
+        "role": role,
+        "email": email,
+        "phone number": phone_number,
+        "location": location,
+        "business area": business_area,
+        "unit": unit,
+        "team": team,
+        "supervisor": supervisor
+    }
+
+    print(json.dumps(persis_info, indent=4))
+    return(persis_info)
+
+#gets additional information on the person which is added manually in each profile
+#saves them in a json format
+def get_additional_info(driver, name):
+    
+    #names = get_names()
+     #for kuerzel in names:
+        #look on page of specific person
+        #print(kuerzel)
+    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)
+        #driver.get("file:///C:/Users/em/Downloads/Stender,%20Jakob%20(jste)%20-%20Mitarbeiter-Masterprofile%20-%20Confluence.html")
+
+    general = driver.find_element("id", "general-de")
+    table = general.find_element("tag name", "tbody")
+
+    try:
+        language = table.find_element("xpath", "./tr[10]/td[2]/h4").text
+    except:
+        print("Languages could not be extracted")
+
+    try:
+        qualification = table.find_element("xpath", "./tr[11]/td[2]/h4").text
+    except:
+        print("Qualifications could not be extracted")
+
+    try:
+        duration = table.find_element("xpath", "./tr[12]/td[2]/h4").text
+    except:
+        print("Duration could not be extracted")
+    
+    additional_info = {
+        "language": language,
+        "qualification": qualification,
+        "duration": duration
+    }
+
+    print(json.dumps(additional_info, indent=4))
+    return(additional_info)
+
+#gets information on the main focus of the person and their special skills
+#saves them in a json format
+def get_main_focus(driver, name):
+    #names = get_names()
+     #for kuerzel in names:
+        #look on page of specific person
+        #print(kuerzel)
+    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)
+    #driver.get("file:///C:/Users/em/Downloads/Schreiber,%20Philip%20(phs)%20-%20Mitarbeiter-Masterprofile%20-%20Confluence.html")
+
+    experience = {
+    "consulting experience": [],
+    "project management experience": [],
+    "development experience": [],
+    "skills": []
+    }
+
+    emphasis_container = driver.find_element("id", "pane-emphasis-de")
+
+    try:
+        consulting = emphasis_container.find_element("xpath", "./div/div[1]")
+        consulting_experience = get_content_as_list(consulting, "./div/div[2]", "li")
+
+        experience["consulting experience"].append(consulting_experience)
+        
+    except:
+        print("Consulting Experience could not be extracted")
+
+    try:
+        project_management = emphasis_container.find_element("xpath", "./div/div[2]")
+        project_management_experience = get_content_as_list(project_management, "./div/div[2]", "li")
+
+        experience["project management experience"].append(project_management_experience)
+
+    except:
+        print("Project Management Experience could not be extracted")
+
+    try:
+        development = emphasis_container.find_element("xpath", "./div/div[3]")
+        development_experience = get_content_as_list(development, "./div/div[2]", "li")
+
+        experience["development experience"].append(development_experience)
+    except:
+        print("Development Experience could not be extratced")
+
+    try:
+        skill_container = driver.find_element("id", "mainSkills-de")
+
+        try:
+            main_skill_1 = skill_container.find_element("xpath", "./h3[1]")
+            main_skill_1_width= skill_container.find_element("xpath", "./div[2]/div").get_attribute("style")
+            main_skill_1_score = main_skill_1_width[7:9]
+            main_skill_1_set = {main_skill_1.text: int(main_skill_1_score)}
+            experience["skills"].append(main_skill_1_set)
+        except:
+            print("Skill1 could not be extracted")
+
+        try:
+            main_skill_2 = skill_container.find_element("xpath", "./h3[2]")
+            main_skill_2_width= skill_container.find_element("xpath", "./div[3]/div").get_attribute("style")
+            main_skill_2_score = main_skill_2_width[7:9]
+            main_skill_2_set = {main_skill_2.text: int(main_skill_2_score)}
+            experience["skills"].append(main_skill_2_set)
+        except:
+            print("Skill2 could not be extracted")
+
+
+        try:
+            main_skill_3 = skill_container.find_element("xpath", "./h3[3]")
+            main_skill_3_width= skill_container.find_element("xpath", "./div[4]/div").get_attribute("style")
+            main_skill_3_score = main_skill_3_width[7:9]
+            main_skill_3_set = {main_skill_3.text: int(main_skill_3_score)}
+            experience["skills"].append(main_skill_3_set)
+        except:
+            print("Skill3 could not be extracted")
+
+
+        print(json.dumps(experience, indent=4))
+    except:
+        None
+
+    return(experience)
+
+#gets information on the different skills
+#saves them in a json format
+def get_skills(driver, name):
+    #names = get_names()
+    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)
+    #driver.get("file:///C:/Users/em/Downloads/Mrozek,%20Emily%20(em)%20-%20Mitarbeiter-Masterprofile%20-%20Confluence.html")
+
+    skills = {"technical": [],
+              "field":[],
+              "other":[]}
+    
+    skills_container = driver.find_element("id", "pane-skills-de")
+
+    try:
+        technical = skills_container.find_element("xpath", "./div/div[1]")
+
+        dev_skills = get_content_as_list(technical, "./div[1]/div[2]", "li")
+
+        languages = get_content_as_list(technical, "./div[2]/div[2]", "li")
+
+        runtime_environments = get_content_as_list(technical, "./div[3]/div[2]", "li")
+        
+        operating_systems = get_content_as_list(technical, "./div[4]/div[2]", "li")
+
+        database_management_systems= get_content_as_list(technical, "./div[5]/div[2]", "li")
+
+        communication_and_networks = get_content_as_list(technical, "./div[6]/div[2]", "li")
+
+        skill_dict = {"software development": dev_skills,
+                      "programming languages": languages,
+                      "runtime environments": runtime_environments,
+                      "operating systems": operating_systems,
+                      "database management systems": database_management_systems,
+                      "communication and networks": communication_and_networks}
+        skills["technical"].append(skill_dict)
+        print(json.dumps(skills, indent=4))
+    except:
+        print("Technical Skills could not be extracted")
+
+    try: 
+        field = skills_container.find_element("xpath", "./div/div[2]")
+
+        industry_skills = get_content_as_list(field, "./div[1]/div[2]", "li")
+
+        consulting = get_content_as_list(field, "./div[2]/div[2]", "li")
+
+
+        skill_dict = {"industry and special skills": industry_skills,
+                      "consulting": consulting
+                      }
+        skills["field"].append(skill_dict)
+        print(json.dumps(skills, indent=4))
+    except:
+        print("Field Skills could not be extracted")
+
+    try: 
+        other_skills = get_content_as_list(skills_container, "./div/div[3]", "li")
+
+        skills["other"].append(other_skills)
+        print(json.dumps(skills, indent=4))
+    except:
+        print("Other Skills could not be extracted")
+    
+    return(skills)
+
+#gets information on the publications and certifications of each person
+#saves them in a json format
+def get_life_events(driver, name):
+    #names = get_names()
+    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)
+    #driver.get("file:///C:/Users/em/Downloads/Schreiber,%20Philip%20(phs)%20-%20Mitarbeiter-Masterprofile%20-%20Confluence.html")
+
+    life_event_record = {
+        "certificates": [],
+        "publications": []
+    }
+    life_events = driver.find_element("id", "pane-lifeEvents-de")
+
+    try:
+        certificates = life_events.find_elements("class name", "certificate")
+
+        for certificate in certificates:
+            title_and_date = certificate.find_element("xpath", "./div[1]").text
+            issuer = certificate.find_element("xpath", "./div[2]").text
+            name = title_and_date[:-8]
+            date = title_and_date[-7:]
+
+            certification = {"name": name,
+                             "date": date,
+                             "issuer": issuer}
+
+            life_event_record["certificates"].append(certification)
+
+        print(json.dumps(life_event_record, indent=4))
+
+    except:
+        print("Certificates could not be extracted")
+    
+    try:
+        publications = life_events.find_elements("class name", "publication")
+
+        for publication in publications:
+            title_and_date = publication.find_element("xpath", "./div[1]").text
+            publisher = publication.find_element("xpath", "./div[2]").text
+            title = title_and_date[:-8]
+            date = title_and_date[-7:]
+
+            pub = {"title": title,
+                             "date": date,
+                             "publisher": publisher}
+
+            life_event_record["publications"].append(pub)
+
+        print(json.dumps(life_event_record, indent=4))
+
+    except:
+        print("Publications could not be extracted")
+    return(life_event_record)
+
+#gets information on all the projects and the related tasks of each person
+#saves them in a json format
+def get_projects(driver, name):
+    #names = get_names()
+    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)
+    #driver.get("file:///C:/Users/em/Downloads/Seifert,%20Raphael%20(rse)%20-%20Mitarbeiter-Masterprofile%20-%20Confluence.html")
+
+    project_table = driver.find_element("id", "pane-projects-de")
+    projects = project_table.find_elements("tag name", "table")
+
+    all_projects = {"project": []}
+
+    try:
+        for project in projects:
+            period = project.find_element("class name", "projectPeriod").text
+            title = project.find_element("class name", "projectTitle").text
+            role = project.find_element("class name", "projectRole").text
+            client = project.find_element("class name", "projectClient").text
+            department = project.find_element("class name", "projectDepartment").text
+            description = project.find_element("class name", "projectDescription").text
+            tasks = get_content_as_list(project, "./tbody/tr/td[2]", "li")
+
+            proj = {
+                "period": period,
+                "title": title,
+                "role": role,
+                "client": client,
+                "department": department,
+                "description": description,
+                "tasks": tasks
+            }
+
+            all_projects["project"].append(proj)
+
+        print(json.dumps(all_projects, indent=4))
+
+    except:
+        print("Projects could not be extracted")
+
+    return(all_projects)
+
+
+#extracts content from a certain tag and saves it in a list
+def get_content_as_list(driver, relative_content_path, type):
+
+    content_div = driver.find_element("xpath", relative_content_path)
+    list_elements = content_div.find_elements("tag name",type)
+    element_list = []
+    
+    for item in list_elements:
+        element_list.append(item.text)
+
+    return(element_list)
+
+
+if __name__ == "__main__":
+    with  webdriver.Chrome() as driver:
+        # manuel Login
+        driver.get("https://confluence.ppi.de")
+        getpass.getpass("Press Enter after You are done logging in")
+
+        kuerzel = 'lawi'
+        
+        #get_persis_info(driver, kuerzel)
+        #get_additional_info(driver, kuerzel)
+        #get_main_focus(driver, kuerzel)
+        get_skills(driver, kuerzel)
+        #get_life_events(driver, kuerzel)
+        #get_projects(driver, kuerzel)
--- a/source/Confluence/json_to_df.py
+++ b/source/Confluence/json_to_df.py
@@ -0,0 +1,130 @@
+import json
+import os
+import pandas as pd
+
+
+def convert_json_to_dataframe() -> None:
+    directory = "source\Data"
+
+    df_all_global_info = pd.DataFrame()
+    df_all_languages = pd.DataFrame()
+    df_all_main_skills = pd.DataFrame()
+    df_all_skills = pd.DataFrame()
+    df_all_projects = pd.DataFrame()
+    df_all_project_tasks = pd.DataFrame()
+    df_all_emphasis = pd.DataFrame()
+    df_all_publications = pd.DataFrame()
+    df_all_certificates = pd.DataFrame()
+
+    for entry in os.scandir(directory):
+        if entry.is_file() and entry.name.endswith(".json"):
+            path = entry.path
+            short = entry.name.removesuffix(".json")
+
+        with open(path, "r") as f:
+            data = json.loads(f.read())
+
+        # collects all global information on a person
+        df_global_info = pd.json_normalize(data.get("global", []))
+        df_global_info["graduation"] = data.get("de").get("graduation")
+        df_global_info["career"] = data.get("de").get("career")
+        if hasattr(df_global_info, "yearOfBirth"):
+            df_global_info = df_global_info.drop(["yearOfBirth"], axis=1)
+        df_global_info["short"] = short
+        df_all_global_info = pd.concat([df_global_info, df_all_global_info])
+
+        # collects all the languages a person speaks
+        df_languages = pd.DataFrame(data["de"]["languages"], columns=["language"])
+        df_languages["short"] = short
+        df_all_languages = pd.concat([df_languages, df_all_languages])
+
+        # collects the main skills and the according scores
+        df_main_skills = pd.json_normalize(data["de"]["mainSkills"])
+        try:
+            main_skill1 = df_main_skills[["mainSkill1Title", "mainSkill1Quality"]]
+            main_skill1 = main_skill1.rename(
+                {"mainSkill1Title": "skillName", "mainSkill1Quality": "skillLevel"},
+                axis=1,
+            )
+        except:
+            main_skill1 = pd.DataFrame()
+        try:
+            main_skill2 = df_main_skills[["mainSkill2Title", "mainSkill2Quality"]]
+            main_skill2 = main_skill2.rename(
+                {"mainSkill2Title": "skillName", "mainSkill2Quality": "skillLevel"},
+                axis=1,
+            )
+        except:
+            main_skill2 = pd.DataFrame()
+        try:
+            main_skill3 = df_main_skills[["mainSkill3Title", "mainSkill3Quality"]]
+            main_skill3 = main_skill3.rename(
+                {"mainSkill3Title": "skillName", "mainSkill3Quality": "skillLevel"},
+                axis=1,
+            )
+        except:
+            main_skill3 = pd.DataFrame()
+        df_main_skills = pd.concat([main_skill1, main_skill2, main_skill3])
+        df_main_skills["short"] = short
+        df_all_main_skills = pd.concat([df_main_skills, df_all_main_skills])
+
+        # collects all skills by skill type
+        df_skill_all = pd.json_normalize(data.get("de", []).get("skills", []))
+
+        df_skills = (
+            pd.DataFrame(df_skill_all.items(), columns=["skillType", "skill"])
+            .explode("skill")
+            .explode("skill")
+        )
+        df_skills["short"] = short
+        df_all_skills = pd.concat([df_skills, df_all_skills])
+
+        # collects all projects
+        df_projects = pd.json_normalize(data.get("de", []).get("projects", []))
+        if hasattr(df_projects, "favorite"):
+            df_projects = df_projects.drop(["favorite"], axis=1)
+        if hasattr(df_projects, "tasks"):
+            df_projects = df_projects.drop(["tasks"], axis=1)
+        df_projects["short"] = short
+        df_all_projects = pd.concat([df_projects, df_all_projects])
+
+        # collects all tasks per project and person
+        df_project_tasks = pd.json_normalize(
+            data.get("de", []).get("projects", []),
+            record_path=["tasks"],
+            meta=["title"],
+        )
+        df_project_tasks.rename({0: "task"}, axis=1, inplace=True)
+        df_project_tasks["short"] = short
+        df_all_project_tasks = pd.concat([df_project_tasks, df_all_project_tasks])
+
+        # collects all tasks per main emphasis
+        df_emphasis_all = pd.json_normalize(data.get("de", []).get("emphasis", []))
+        df_emphasis = (
+            pd.DataFrame(df_emphasis_all.items(), columns=["emphasis", "skill"])
+            .explode("skill")
+            .explode("skill")
+        )
+        df_emphasis["short"] = short
+        df_all_emphasis = pd.concat([df_emphasis, df_all_emphasis])
+
+        # collects all publications from a person
+        # if there are no publications, an empty list is created
+        df_publications = pd.json_normalize(data.get("de", []).get("publications", []))
+        df_publications["short"] = short
+        df_all_publications = pd.concat([df_publications, df_all_publications])
+
+        # collects all certificates a person has
+        df_certificates = pd.json_normalize(data.get("de", []).get("certificates", []))
+        df_certificates["short"] = short
+        df_all_certificates = pd.concat([df_certificates, df_all_certificates])
+
+    df_all_global_info.to_csv("source\Tables\global_info.csv")
+    df_all_languages.to_csv("source\Tables\languages.csv")
+    df_all_main_skills.to_csv("source\Tables\main_skills.csv")
+    df_all_skills.to_csv("source\Tables\skills.csv")
+    df_all_projects.to_csv("source\Tables\projects.csv")
+    df_all_project_tasks.to_csv("source\Tables\project_tasks.csv")
+    df_all_emphasis.to_csv("source\Tables\emphasis.csv")
+    df_all_publications.to_csv("source\Tables\publications.csv")
+    df_all_certificates.to_csv("source\Tables\certificates.csv")
--- a/source/Confluence/test.ipynb
+++ b/source/Confluence/test.ipynb
@@ -0,0 +1,109 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from selenium import webdriver\n",
+    "import json\n",
+    "import getpass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "name = 'lawi'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\n",
+      "    \"job_description\": \"Senior Consultant (L2201)\",\n",
+      "    \"role\": \"Senior Consultant\",\n",
+      "    \"email\": \"Lasse.Wiedemann@crossnative.com\",\n",
+      "    \"phone number\": \"+4943188810-0\",\n",
+      "    \"location\": \"GS Hamburg\",\n",
+      "    \"business area\": \"GB PPI-X\",\n",
+      "    \"unit\": \"X-Consulting, sada\",\n",
+      "    \"team\": \"CX, sada, Team DnA olsc\",\n",
+      "    \"supervisor\": \"Ole Schmidt (olsc)\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "driver = webdriver.Chrome()\n",
+    "driver.get(\"https://confluence.ppi.de\")\n",
+    "getpass.getpass(\"Press Enter after You are done logging in\")\n",
+    "driver.get(\"https://confluence.ppi.de/masterprofiles/viewprofile.action?username=\" + name)\n",
+    "general = driver.find_element(\"id\", \"general-de\")\n",
+    "table = general.find_element(\"tag name\", \"tbody\")\n",
+    "job_description = general.find_element(\"xpath\", \"./p/span\").text \n",
+    "role = general.find_element(\"xpath\", \"./p/i\").text\n",
+    "email = table.find_element(\"xpath\", \"./tr[1]/td[2]/a/h4\").text\n",
+    "phone_number = table.find_element(\"xpath\", \"./tr[2]/td[2]/a/h4\").text\n",
+    "location = table.find_element(\"xpath\", \"./tr[4]/td[2]/h4\").text\n",
+    "business_area = table.find_element(\"xpath\", \"./tr[5]/td[2]/h4\").text\n",
+    "unit = table.find_element(\"xpath\", \"./tr[6]/td[2]/h4\").text\n",
+    "team = table.find_element(\"xpath\", \"./tr[7]/td[2]/h4\").text\n",
+    "supervisor = table.find_element(\"xpath\", \"./tr[8]/td[2]/a/h4\").text\n",
+    "persis_info = {\n",
+    "    \"job_description\": job_description,\n",
+    "    \"role\": role,\n",
+    "    \"email\": email,\n",
+    "    \"phone number\": phone_number,\n",
+    "    \"location\": location,\n",
+    "    \"business area\": business_area,\n",
+    "    \"unit\": unit,\n",
+    "    \"team\": team,\n",
+    "    \"supervisor\": supervisor\n",
+    "}\n",
+    "\n",
+    "print(json.dumps(persis_info, indent=4))\n",
+    "\n",
+    "driver.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}