"""Scrape employee master-profile pages from Confluence with Selenium.

Each ``get_*`` function opens the profile page of one user, reads one
section of the page and returns the extracted data as a JSON-serialisable
dict (which is also printed for inspection).
"""

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import json
import getpass
import re

# Base URL of a profile page; the user's short name is appended to it.
PROFILE_URL = "https://confluence.ppi.de/masterprofiles/viewprofile.action?username="

# Rows of the general table that get_persis_info reads (result key, XPath).
_PERSIS_TABLE_FIELDS = (
    ("email", "./tr[1]/td[2]/a/h4"),
    ("phone number", "./tr[2]/td[2]/a/h4"),
    ("location", "./tr[4]/td[2]/h4"),
    ("business area", "./tr[5]/td[2]/h4"),
    ("unit", "./tr[6]/td[2]/h4"),
    ("team", "./tr[7]/td[2]/h4"),
    ("supervisor", "./tr[8]/td[2]/a/h4"),
)

# Sections of the emphasis pane (result key, XPath, label used in messages).
_EXPERIENCE_SECTIONS = (
    ("consulting experience", "./div/div[1]", "Consulting Experience"),
    ("project management experience", "./div/div[2]", "Project Management Experience"),
    ("development experience", "./div/div[3]", "Development Experience"),
)

# Sub-sections of the technical skills column (result key, XPath).
_TECHNICAL_SKILL_GROUPS = (
    ("software development", "./div[1]/div[2]"),
    ("programming languages", "./div[2]/div[2]"),
    ("runtime environments", "./div[3]/div[2]"),
    ("operating systems", "./div[4]/div[2]"),
    ("database management systems", "./div[5]/div[2]"),
    ("communication and networks", "./div[6]/div[2]"),
)

# Sub-sections of the field skills column (result key, XPath).
_FIELD_SKILL_GROUPS = (
    ("industry and special skills", "./div[1]/div[2]"),
    ("consulting", "./div[2]/div[2]"),
)

# Extracts the integer part of a CSS width such as "width: 80%;".
_WIDTH_PATTERN = re.compile(r"width:\s*(\d+)")


def _open_profile(driver, name):
    """Navigate *driver* to the profile page of the user *name*."""
    # For offline debugging, a saved copy of a profile page (file:// URL)
    # can be loaded here instead.
    driver.get(PROFILE_URL + name)


def _text_or_none(parent, xpath, label):
    """Return the text of the element at *xpath*, or None if the lookup fails.

    On failure "<label> could not be extracted" is printed, so a missing
    field is reported but does not abort the whole extraction.
    """
    try:
        return parent.find_element("xpath", xpath).text
    except WebDriverException:
        print(label + " could not be extracted")
        return None


def _extract_main_skill(skill_container, position):
    """Return ``{skill name: score}`` for the main skill at *position* (1-based).

    The name is the text of the position-th ``h3``; the score is the width
    found in the ``style`` attribute of the matching bar element.
    Raises ValueError when the style attribute holds no width.
    """
    heading = skill_container.find_element("xpath", f"./h3[{position}]")
    # The bar of skill n sits in div[n + 1] of the container.
    bar = skill_container.find_element("xpath", f"./div[{position + 1}]/div")
    style = bar.get_attribute("style") or ""
    # A regex instead of fixed slicing so that one- and three-digit widths
    # (e.g. "width: 100%") are read correctly.
    # NOTE(review): assumes the width is given as a percentage - confirm.
    match = _WIDTH_PATTERN.search(style)
    if match is None:
        raise ValueError("no width in style attribute: " + repr(style))
    return {heading.text: int(match.group(1))}


def _split_title_and_date(text):
    """Split a "title + date" string into ``(title, date)``.

    The date is taken as the last 7 characters and one separator character
    before it is dropped.
    NOTE(review): presumably the date looks like "MM/YYYY" - confirm.
    """
    return text[:-8], text[-7:]


# gets all basic information on the person which is synchronized with persis
# saves them in a json format
def get_persis_info(driver, name):
    """Read the general block of the profile of *name*.

    Returns a dict with job description, role and the entries of the general
    table. Lookup errors are not caught here and propagate to the caller.
    """
    _open_profile(driver, name)
    general = driver.find_element("id", "general-de")
    table = general.find_element("tag name", "tbody")

    persis_info = {
        "job_description": general.find_element("xpath", "./p/span").text,
        "role": general.find_element("xpath", "./p/i").text,
    }
    for key, xpath in _PERSIS_TABLE_FIELDS:
        persis_info[key] = table.find_element("xpath", xpath).text

    print(json.dumps(persis_info, indent=4))
    return persis_info


# gets additional information on the person which is added manually in each profile
# saves them in a json format
def get_additional_info(driver, name):
    """Read language, qualification and duration from the profile of *name*.

    Each field is optional: a field that cannot be read is reported on
    stdout and stored as None instead of aborting the call.
    """
    _open_profile(driver, name)
    general = driver.find_element("id", "general-de")
    table = general.find_element("tag name", "tbody")

    additional_info = {
        "language": _text_or_none(table, "./tr[10]/td[2]/h4", "Languages"),
        "qualification": _text_or_none(table, "./tr[11]/td[2]/h4", "Qualifications"),
        "duration": _text_or_none(table, "./tr[12]/td[2]/h4", "Duration"),
    }
    print(json.dumps(additional_info, indent=4))
    return additional_info


# gets information on the main focus of the person and their special skills
# saves them in a json format
def get_main_focus(driver, name):
    """Read the experience lists and the three main skills of *name*.

    Returns a dict whose experience entries each hold the list of bullet
    texts (as one nested list) and whose "skills" entry holds one
    ``{skill name: score}`` dict per main skill that could be read.
    """
    _open_profile(driver, name)
    experience = {
        "consulting experience": [],
        "project management experience": [],
        "development experience": [],
        "skills": [],
    }

    emphasis_container = driver.find_element("id", "pane-emphasis-de")
    for key, xpath, label in _EXPERIENCE_SECTIONS:
        try:
            section = emphasis_container.find_element("xpath", xpath)
            items = get_content_as_list(section, "./div/div[2]", "li")
        except WebDriverException:
            print(label + " could not be extracted")
        else:
            experience[key].append(items)

    try:
        skill_container = driver.find_element("id", "mainSkills-de")
    except WebDriverException:
        # Previously swallowed silently; report it like every other section.
        print("Main skills could not be extracted")
    else:
        for position in (1, 2, 3):
            try:
                experience["skills"].append(
                    _extract_main_skill(skill_container, position)
                )
            except (WebDriverException, ValueError):
                print(f"Skill{position} could not be extracted")
        print(json.dumps(experience, indent=4))

    return experience


# gets information on the different skills
# saves them in a json format
def get_skills(driver, name):
    """Read the technical, field and other skills of *name*.

    Each of the three groups is extracted independently; a group that cannot
    be read is reported on stdout and left as an empty list.
    """
    _open_profile(driver, name)
    skills = {"technical": [], "field": [], "other": []}
    skills_container = driver.find_element("id", "pane-skills-de")

    try:
        technical = skills_container.find_element("xpath", "./div/div[1]")
        skill_dict = {
            key: get_content_as_list(technical, xpath, "li")
            for key, xpath in _TECHNICAL_SKILL_GROUPS
        }
        skills["technical"].append(skill_dict)
        print(json.dumps(skills, indent=4))
    except WebDriverException:
        print("Technical Skills could not be extracted")

    try:
        field = skills_container.find_element("xpath", "./div/div[2]")
        skill_dict = {
            key: get_content_as_list(field, xpath, "li")
            for key, xpath in _FIELD_SKILL_GROUPS
        }
        skills["field"].append(skill_dict)
        print(json.dumps(skills, indent=4))
    except WebDriverException:
        print("Field Skills could not be extracted")

    try:
        other_skills = get_content_as_list(skills_container, "./div/div[3]", "li")
        skills["other"].append(other_skills)
        print(json.dumps(skills, indent=4))
    except WebDriverException:
        print("Other Skills could not be extracted")

    return skills


# gets information on the publications and certifications of each person
# saves them in a json format
def get_life_events(driver, name):
    """Read the certificates and publications of *name*.

    Entries collected before a failure are kept; the failure itself is
    reported on stdout.
    """
    _open_profile(driver, name)
    life_event_record = {"certificates": [], "publications": []}
    life_events = driver.find_element("id", "pane-lifeEvents-de")

    try:
        for certificate in life_events.find_elements("class name", "certificate"):
            title_and_date = certificate.find_element("xpath", "./div[1]").text
            issuer = certificate.find_element("xpath", "./div[2]").text
            # Separate local name: must not overwrite the `name` parameter.
            cert_name, date = _split_title_and_date(title_and_date)
            life_event_record["certificates"].append(
                {"name": cert_name, "date": date, "issuer": issuer}
            )
        print(json.dumps(life_event_record, indent=4))
    except WebDriverException:
        print("Certificates could not be extracted")

    try:
        for publication in life_events.find_elements("class name", "publication"):
            title_and_date = publication.find_element("xpath", "./div[1]").text
            publisher = publication.find_element("xpath", "./div[2]").text
            title, date = _split_title_and_date(title_and_date)
            life_event_record["publications"].append(
                {"title": title, "date": date, "publisher": publisher}
            )
        print(json.dumps(life_event_record, indent=4))
    except WebDriverException:
        print("Publications could not be extracted")

    return life_event_record


# gets information on all the projects and the related tasks of each person
# saves them in a json format
def get_projects(driver, name):
    """Read every project table of *name* together with its task list.

    Returns ``{"project": [...]}`` with one dict per project. Projects
    collected before a failure are kept; the failure is reported on stdout.
    """
    _open_profile(driver, name)
    project_table = driver.find_element("id", "pane-projects-de")
    projects = project_table.find_elements("tag name", "table")
    all_projects = {"project": []}

    try:
        for project in projects:
            proj = {
                "period": project.find_element("class name", "projectPeriod").text,
                "title": project.find_element("class name", "projectTitle").text,
                "role": project.find_element("class name", "projectRole").text,
                "client": project.find_element("class name", "projectClient").text,
                "department": project.find_element("class name", "projectDepartment").text,
                "description": project.find_element("class name", "projectDescription").text,
                "tasks": get_content_as_list(project, "./tbody/tr/td[2]", "li"),
            }
            all_projects["project"].append(proj)
        print(json.dumps(all_projects, indent=4))
    except WebDriverException:
        print("Projects could not be extracted")

    return all_projects


# extracts content from a certain tag and saves it in a list
def get_content_as_list(driver, relative_content_path, type):
    """Return the texts of all *type* tags below *relative_content_path*.

    *driver* is the search root: a WebDriver or any element that offers
    ``find_element``. *relative_content_path* is an XPath evaluated against
    it and *type* is the tag name of the items to collect (e.g. "li").
    The parameter name ``type`` shadows the builtin but is kept so that
    existing keyword callers keep working.
    """
    content_div = driver.find_element("xpath", relative_content_path)
    return [item.text for item in content_div.find_elements("tag name", type)]


if __name__ == "__main__":
    with webdriver.Chrome() as driver:
        # Manual login: the user signs in inside the opened browser window
        # and confirms on the console when done.
        driver.get("https://confluence.ppi.de")
        getpass.getpass("Press Enter after You are done logging in")

        # Short name of the profile to scrape.
        kuerzel = 'lawi'
        #get_persis_info(driver, kuerzel)
        #get_additional_info(driver, kuerzel)
        #get_main_focus(driver, kuerzel)
        get_skills(driver, kuerzel)
        #get_life_events(driver, kuerzel)
        #get_projects(driver, kuerzel)