355 lines
13 KiB
Python
355 lines
13 KiB
Python
from selenium import webdriver
|
|
import json
|
|
import getpass
|
|
|
|
#gets all basic information on the person which is synchronized with persis
|
|
#saves them in a json format
|
|
def get_persis_info(driver, name):
    """Read the basic profile data of one person and return it as a dict.

    Opens the master-profile page for ``name``, reads the job description
    and role from the ``general-de`` pane and the contact / organisation
    cells from that pane's table body, prints the result as indented JSON
    and returns it.

    Args:
        driver: Selenium WebDriver (already logged in by the caller).
        name: user id appended to the profile URL.

    Returns:
        dict with the keys ``job_description``, ``role``, ``email``,
        ``phone number``, ``location``, ``business area``, ``unit``,
        ``team`` and ``supervisor``.

    Raises:
        Whatever ``find_element`` raises when one of the elements is
        missing; nothing is caught here.
    """
    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)

    general_pane = driver.find_element("id", "general-de")
    details_table = general_pane.find_element("tag name", "tbody")

    # (result key, element to search in, relative xpath).
    # The rows are looked up strictly top to bottom.
    lookups = (
        ("job_description", general_pane, "./p/span"),
        ("role", general_pane, "./p/i"),
        ("email", details_table, "./tr[1]/td[2]/a/h4"),
        ("phone number", details_table, "./tr[2]/td[2]/a/h4"),
        ("location", details_table, "./tr[4]/td[2]/h4"),
        ("business area", details_table, "./tr[5]/td[2]/h4"),
        ("unit", details_table, "./tr[6]/td[2]/h4"),
        ("team", details_table, "./tr[7]/td[2]/h4"),
        ("supervisor", details_table, "./tr[8]/td[2]/a/h4"),
    )
    persis_info = {
        key: scope.find_element("xpath", path).text
        for key, scope, path in lookups
    }

    print(json.dumps(persis_info, indent=4))
    return persis_info
|
|
|
|
#gets additional information on the person which is added manually in each profile
|
|
#saves them in a json format
|
|
def get_additional_info(driver, name):
    """Read the manually maintained profile fields of one person.

    Opens the master-profile page for ``name`` and reads rows 10-12 of the
    table inside the ``general-de`` pane (language, qualification,
    duration). Each field is extracted on a best-effort basis: when a row
    cannot be read, a message is printed and the field is reported as
    ``None`` instead of aborting the whole call.

    Args:
        driver: Selenium WebDriver (already logged in by the caller).
        name: user id appended to the profile URL.

    Returns:
        dict with the keys ``language``, ``qualification`` and
        ``duration``; a value is ``None`` when it could not be extracted.

    Raises:
        Whatever ``find_element`` raises when the ``general-de`` pane or
        its table body is missing; only the per-field lookups are guarded.
    """
    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)

    general = driver.find_element("id", "general-de")
    table = general.find_element("tag name", "tbody")

    # (result key, relative xpath, message printed when the lookup fails)
    fields = (
        ("language", "./tr[10]/td[2]/h4", "Languages could not be extracted"),
        ("qualification", "./tr[11]/td[2]/h4", "Qualifications could not be extracted"),
        ("duration", "./tr[12]/td[2]/h4", "Duration could not be extracted"),
    )

    additional_info = {}
    for key, xpath, failure_message in fields:
        try:
            additional_info[key] = table.find_element("xpath", xpath).text
        except Exception:
            # Best effort: a missing row must not leave the key undefined.
            # Previously the variable stayed unbound and building the
            # result dict crashed with UnboundLocalError.
            print(failure_message)
            additional_info[key] = None

    print(json.dumps(additional_info, indent=4))
    return additional_info
|
|
|
|
#gets information on the main focus of the person and their special skills
|
|
#saves them in a json format
|
|
def _parse_skill_score(style):
    """Return the integer width value from an inline style like ``width: 80%;``.

    Raises:
        ValueError: if ``style`` contains no numeric ``width`` declaration.
    """
    import re

    # Anchor on start-of-string or ';' so that e.g. ``max-width`` is not
    # mistaken for ``width``.
    match = re.search(r"(?:^|;)\s*width:\s*(\d+)", style or "")
    if match is None:
        raise ValueError("no width found in style: {!r}".format(style))
    return int(match.group(1))


def get_main_focus(driver, name):
    """Read the experience lists and main skills of one person.

    Opens the master-profile page for ``name``. From the
    ``pane-emphasis-de`` pane it reads three list sections (consulting,
    project management, development); from the ``mainSkills-de`` element
    it reads up to three skills, each as ``{heading text: score}`` where
    the score is the numeric ``width`` of the skill bar's inline style.
    Every section and every skill is extracted on a best-effort basis: a
    failure prints a message and the entry is left out.

    Args:
        driver: Selenium WebDriver (already logged in by the caller).
        name: user id appended to the profile URL.

    Returns:
        dict with the keys ``consulting experience``,
        ``project management experience``, ``development experience``
        (each a list holding at most one list of strings) and ``skills``
        (a list of single-entry dicts).

    Raises:
        Whatever ``find_element`` raises when ``pane-emphasis-de`` is
        missing; that lookup is not guarded.
    """
    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)

    experience = {
        "consulting experience": [],
        "project management experience": [],
        "development experience": [],
        "skills": [],
    }

    emphasis_container = driver.find_element("id", "pane-emphasis-de")

    # (result key, xpath of the section, message printed on failure)
    sections = (
        ("consulting experience", "./div/div[1]",
         "Consulting Experience could not be extracted"),
        ("project management experience", "./div/div[2]",
         "Project Management Experience could not be extracted"),
        ("development experience", "./div/div[3]",
         "Development Experience could not be extratced"),
    )
    for key, xpath, failure_message in sections:
        try:
            section = emphasis_container.find_element("xpath", xpath)
            entries = get_content_as_list(section, "./div/div[2]", "li")
            # The list is appended as one element (nested list); callers
            # may rely on that shape, so it is kept.
            experience[key].append(entries)
        except Exception:
            print(failure_message)

    try:
        skill_container = driver.find_element("id", "mainSkills-de")
    except Exception:
        # Previously swallowed without any trace.
        print("Main skills could not be extracted")
        return experience

    # Skill i has its heading at h3[i] and its bar at div[i + 1]/div.
    for position in (1, 2, 3):
        try:
            heading = skill_container.find_element("xpath", "./h3[{}]".format(position))
            bar_style = skill_container.find_element(
                "xpath", "./div[{}]/div".format(position + 1)
            ).get_attribute("style")
            # A fixed slice [7:9] only worked for two-digit values
            # (100 became 10, 5 failed); parse the number instead.
            score = _parse_skill_score(bar_style)
            experience["skills"].append({heading.text: score})
        except Exception:
            print("Skill{} could not be extracted".format(position))

    print(json.dumps(experience, indent=4))
    return experience
|
|
|
|
#gets information on the different skills
|
|
#saves them in a json format
|
|
def get_skills(driver, name):
    """Read the technical, field and other skills of one person.

    Opens the master-profile page for ``name`` and reads three groups
    from the ``pane-skills-de`` pane. Each group is extracted on a
    best-effort basis: if any lookup inside a group fails, a message is
    printed and that group stays empty. After each successfully read
    group the accumulated result is printed as indented JSON.

    Args:
        driver: Selenium WebDriver (already logged in by the caller).
        name: user id appended to the profile URL.

    Returns:
        dict with the keys ``technical`` and ``field`` (each a list
        holding at most one dict of category -> list of strings) and
        ``other`` (a list holding at most one list of strings).

    Raises:
        Whatever ``find_element`` raises when ``pane-skills-de`` is
        missing; that lookup is not guarded.
    """
    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)

    skills = {"technical": [], "field": [], "other": []}

    skills_container = driver.find_element("id", "pane-skills-de")

    # (category label, xpath of its content relative to the group element)
    technical_categories = (
        ("software development", "./div[1]/div[2]"),
        ("programming languages", "./div[2]/div[2]"),
        ("runtime environments", "./div[3]/div[2]"),
        ("operating systems", "./div[4]/div[2]"),
        ("database management systems", "./div[5]/div[2]"),
        ("communication and networks", "./div[6]/div[2]"),
    )
    field_categories = (
        ("industry and special skills", "./div[1]/div[2]"),
        ("consulting", "./div[2]/div[2]"),
    )

    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # behaviour but lets KeyboardInterrupt / SystemExit through.
    try:
        technical = skills_container.find_element("xpath", "./div/div[1]")
        skills["technical"].append({
            label: get_content_as_list(technical, path, "li")
            for label, path in technical_categories
        })
        print(json.dumps(skills, indent=4))
    except Exception:
        print("Technical Skills could not be extracted")

    try:
        field = skills_container.find_element("xpath", "./div/div[2]")
        skills["field"].append({
            label: get_content_as_list(field, path, "li")
            for label, path in field_categories
        })
        print(json.dumps(skills, indent=4))
    except Exception:
        print("Field Skills could not be extracted")

    try:
        other_skills = get_content_as_list(skills_container, "./div/div[3]", "li")
        skills["other"].append(other_skills)
        print(json.dumps(skills, indent=4))
    except Exception:
        print("Other Skills could not be extracted")

    return skills
|
|
|
|
#gets information on the publications and certifications of each person
|
|
#saves them in a json format
|
|
def _split_title_and_date(title_and_date):
    """Split an entry text into ``(title, date)``.

    The last 7 characters are taken as the date and one separator
    character before them is dropped.
    NOTE(review): presumably the page renders dates as ``MM/YYYY`` --
    confirm against a live profile.
    """
    return title_and_date[:-8], title_and_date[-7:]


def get_life_events(driver, name):
    """Read the certificates and publications of one person.

    Opens the master-profile page for ``name`` and collects every element
    with class ``certificate`` and every element with class
    ``publication`` inside the ``pane-lifeEvents-de`` pane. Each group is
    extracted on a best-effort basis: if reading an entry fails, a
    message is printed and the remaining entries of that group are
    skipped (entries read before the failure are kept).

    Args:
        driver: Selenium WebDriver (already logged in by the caller).
        name: user id appended to the profile URL.

    Returns:
        dict with ``certificates`` (list of dicts with ``name``, ``date``,
        ``issuer``) and ``publications`` (list of dicts with ``title``,
        ``date``, ``publisher``).

    Raises:
        Whatever ``find_element`` raises when ``pane-lifeEvents-de`` is
        missing; that lookup is not guarded.
    """
    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)

    life_event_record = {
        "certificates": [],
        "publications": [],
    }
    life_events = driver.find_element("id", "pane-lifeEvents-de")

    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # behaviour but lets KeyboardInterrupt / SystemExit through.
    try:
        for certificate in life_events.find_elements("class name", "certificate"):
            title_and_date = certificate.find_element("xpath", "./div[1]").text
            issuer = certificate.find_element("xpath", "./div[2]").text
            # Use a distinct local so the ``name`` parameter is not shadowed.
            certificate_name, date = _split_title_and_date(title_and_date)
            life_event_record["certificates"].append({
                "name": certificate_name,
                "date": date,
                "issuer": issuer,
            })
        print(json.dumps(life_event_record, indent=4))
    except Exception:
        print("Certificates could not be extracted")

    try:
        for publication in life_events.find_elements("class name", "publication"):
            title_and_date = publication.find_element("xpath", "./div[1]").text
            publisher = publication.find_element("xpath", "./div[2]").text
            title, date = _split_title_and_date(title_and_date)
            life_event_record["publications"].append({
                "title": title,
                "date": date,
                "publisher": publisher,
            })
        print(json.dumps(life_event_record, indent=4))
    except Exception:
        print("Publications could not be extracted")

    return life_event_record
|
|
|
|
#gets information on all the projects and the related tasks of each person
|
|
#saves them in a json format
|
|
def get_projects(driver, name):
    """Read all projects and their tasks for one person.

    Opens the master-profile page for ``name`` and treats every ``table``
    element inside the ``pane-projects-de`` pane as one project. For each
    project the period, title, role, client, department and description
    are read by class name, and the tasks are read as the ``li`` texts
    below ``./tbody/tr/td[2]``. Extraction is best effort: if reading a
    project fails, a message is printed and the remaining projects are
    skipped (projects read before the failure are kept).

    Args:
        driver: Selenium WebDriver (already logged in by the caller).
        name: user id appended to the profile URL.

    Returns:
        dict with the single key ``project`` holding a list of dicts with
        the keys ``period``, ``title``, ``role``, ``client``,
        ``department``, ``description`` and ``tasks``.

    Raises:
        Whatever ``find_element`` / ``find_elements`` raise when the
        ``pane-projects-de`` pane cannot be read; those lookups are not
        guarded.
    """
    driver.get("https://confluence.ppi.de/masterprofiles/viewprofile.action?username=" + name)

    project_table = driver.find_element("id", "pane-projects-de")
    projects = project_table.find_elements("tag name", "table")

    all_projects = {"project": []}

    # (result key, CSS class of the element holding the value)
    project_fields = (
        ("period", "projectPeriod"),
        ("title", "projectTitle"),
        ("role", "projectRole"),
        ("client", "projectClient"),
        ("department", "projectDepartment"),
        ("description", "projectDescription"),
    )

    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # behaviour but lets KeyboardInterrupt / SystemExit through.
    try:
        for project in projects:
            proj = {
                key: project.find_element("class name", css_class).text
                for key, css_class in project_fields
            }
            proj["tasks"] = get_content_as_list(project, "./tbody/tr/td[2]", "li")
            all_projects["project"].append(proj)

        print(json.dumps(all_projects, indent=4))
    except Exception:
        print("Projects could not be extracted")

    return all_projects
|
|
|
|
|
|
#extracts content from a certain tag and saves it in a list
|
|
def get_content_as_list(driver, relative_content_path, type):
    """Return the texts of all ``type`` tags below a given element.

    Args:
        driver: object offering ``find_element`` (the callers in this
            file pass page elements, not the browser driver itself).
        relative_content_path: xpath, relative to ``driver``, of the
            element that holds the content.
        type: tag name of the items to collect, e.g. ``"li"``.

    Returns:
        list of the ``.text`` of every matching tag, in the order
        ``find_elements`` returns them; empty if there are none.
    """
    container = driver.find_element("xpath", relative_content_path)
    return [node.text for node in container.find_elements("tag name", type)]
|
|
|
|
|
|
if __name__ == "__main__":
    # Short user id of the person whose profile is scraped.
    kuerzel = 'lawi'

    with webdriver.Chrome() as driver:
        # Manual login: open Confluence and block until the operator
        # confirms with Enter that the login is done.
        driver.get("https://confluence.ppi.de")
        getpass.getpass("Press Enter after You are done logging in")

        # Toggle the extractors to run by (un)commenting the calls below.
        #get_persis_info(driver, kuerzel)
        #get_additional_info(driver, kuerzel)
        #get_main_focus(driver, kuerzel)
        get_skills(driver, kuerzel)
        #get_life_events(driver, kuerzel)
        #get_projects(driver, kuerzel)