"""Scrape PPI.X project groups, their members and mobile numbers from Dobby.

Notes:
    The XPaths used below were found by hand: right-click in the browser,
    choose "Inspect" ("Untersuchen (Q)" in a German browser) and locate the
    needed table in the element tree.
"""
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
import time

# "sel_browser_test" gives the URL of the Selenium container, so that the
# correct URL is always used to connect; the "_vimes" suffix is needed because
# the container was renamed in this directory.
_SELENIUM_HUB_URL = 'http://sel_browser_test_vimes:4444/wd/hub'

# NOTE: a proxy ("www-cache.ppi.int:3128", passed as --proxy-server=...) was
# tried to get onto dobby.int, but it did not work, so it is not set here.
_CHROME_ARGUMENTS = (
    '--disable-blink-features=AutomationControlled',
    '--ignore-ssl-errors=yes',
    '--ignore-certificate-errors',
)

_DOBBY_AD_GROUPS_URL = "https://dobby.ppi.int/cgi-bin/ad/adgroups.pl"
# The lower of the two tables on a Dobby page holds the listing.
_LOWER_TABLE_XPATH = "/html/body/div/div/table[2]/tbody"
# Skip the first row of a table: it only holds the caption.
_DATA_ROWS_XPATH = ".//tr[not(position()=1)]"
# Cell of the person page that is read as the mobile number.
_MOBILE_CELL_XPATH = "/html/body/div/div/table[2]/tbody/tr[9]/td[2]"
# Only groups whose id starts like this belong to PPI.X.
_PPIX_GROUP_PREFIX = 'prj_ppix'


def _connect(startup_delay):
    """Sleep for *startup_delay* seconds, then open a remote Chrome session.

    The sleep gives the Selenium container time to come up before the
    connection is attempted.
    """
    time.sleep(startup_delay)
    options = webdriver.ChromeOptions()
    for argument in _CHROME_ARGUMENTS:
        options.add_argument(argument)
    print("Now will connect")
    driver = webdriver.Remote(command_executor=_SELENIUM_HUB_URL, options=options)
    print("did connect successfully")
    return driver


def _iterTableRows(driver):
    """Yield (first cell, second cell) texts of each data row of the lower table.

    The first cell is printed as soon as it is read. Rows that lack one of the
    two cells are reported and skipped. A missing table raises on the first
    iteration, so the caller's ``try`` must wrap the loop.
    """
    table = driver.find_element("xpath", _LOWER_TABLE_XPATH)
    for row in table.find_elements("xpath", _DATA_ROWS_XPATH):
        try:
            first = row.find_element("xpath", ".//td[1]").text
            print(first)
            second = row.find_element("xpath", ".//td[2]").text
        except NoSuchElementException:
            print("NoSuchElementException")
            continue
        yield first, second


def _reportProxyErrorPage(wait):
    """Print the 'sub-frame-error' elements if the crashed proxy page is shown."""
    try:
        print("now the element of the wrong page")
        elements = wait.until(EC.presence_of_all_elements_located(("id", 'sub-frame-error')))
        print(elements)
    except Exception:
        # Not finding the element within the wait is the expected, good outcome.
        print("at least not the crashing proxy webpage")


def getDriver():
    """Define a driver (= mouse pointer) using Chrome as browser.

    Firefox did not work. The 10 second sleep is definitely needed, since
    Selenium has to be established first.

    OUT:
        driver: Selenium driver object, which can connect to a website.
                The caller is responsible for closing it.
    """
    return _connect(startup_delay=10)


def findGroupNames(driver):
    """Find the names of each group of PPI.X on Dobby.

    IN:
        driver: Selenium driver object, which can connect to a website
    OUT:
        groupsPpiX: list of [group id, description] pairs (strings); empty
                    when the page or its table could not be read
    """
    groupsPpiX = []

    # define the link to access and access it
    pathDobbyProjects = _DOBBY_AD_GROUPS_URL + "?ou=Projekte"
    print(pathDobbyProjects)
    try:
        driver.get(pathDobbyProjects)
        print("on website now")
    except Exception as exc:
        print(f"could not connect to Dobby website: {exc!r}")

    # get data from Dobby: first column is the group, second its description
    try:
        for groupId, groupDesc in _iterTableRows(driver):
            if groupId.startswith(_PPIX_GROUP_PREFIX):
                groupsPpiX.append([groupId, groupDesc])
        print("done going through all projects")
    except Exception as exc:
        print(f"no such tables found, where we get group names: {exc!r}")

    return groupsPpiX


def getPersons(driver, groupsNames):
    """Find all persons working at PPI.X and save names and short notation.

    IN:
        driver: Selenium driver object, which can connect to a website
        groupsNames: sequence of (group id, description) string pairs
    OUT:
        persons: set of (short notation, name) tuples; tuples are used because
                 they are hashable, so each person is stored only once
    """
    persons = set()
    for group, _groupDesc in groupsNames:
        try:
            driver.get(f"{_DOBBY_AD_GROUPS_URL}?group={group}")
        except Exception as exc:
            print(f"could not connect to group webpage: {exc!r}")
            continue

        # go through each row in the table with short notation and name
        try:
            for short, name in _iterTableRows(driver):
                persons.add((short, name))
        except Exception as exc:
            print(f"no such tables found, where we get person names: {exc!r}")

    return persons


def getDobbyPersonData(driver, names):
    """Scrape the mobile phone number of each person from their Dobby page.

    The result has the same order as *names*.

    IN:
        driver: Selenium driver object
        names: sized iterable of (short notation, full name) string pairs
    OUT:
        allNumbers: numpy array (n, 2) of strings; column 0 is the mobile
                    number (empty if the page has no such cell), column 1 the
                    short notation
    """
    personPath = _DOBBY_AD_GROUPS_URL + "?user="
    allNumbers = np.zeros((len(names), 2), dtype='U60')

    for s, short in enumerate(names):
        # short[0] has the short notation and short[1] the full name
        print(short[0])
        print(personPath + short[0])
        driver.get(personPath + short[0])
        allNumbers[s, 1] = short[0]
        try:
            mobileNumber = driver.find_element("xpath", _MOBILE_CELL_XPATH).text
        except NoSuchElementException:
            # One page without the expected row must not abort the whole
            # scrape; the number simply stays empty for this person.
            print("NoSuchElementException")
            continue
        print(mobileNumber)
        allNumbers[s, 0] = mobileNumber

    print(allNumbers)
    return allNumbers


# From here on only some tests used when debugging.


def testDriver():
    """Test the driver in Docker with a public webpage (YouTube) and print some data.

    Opens its own session and always closes it again before returning.
    """
    # need to wait a little to assure that the selenium container is up and
    # running, before connecting to it
    driver = _connect(startup_delay=3)
    try:
        # just a test, to see if it works for a public website => it does
        pathTest = "https://about.youtube/"
        driver.get(pathTest)
        print(pathTest)
        print("someTesting only")

        # driver will wait for up to 10 seconds
        wait = WebDriverWait(driver, 10)
        try:
            elements = wait.until(EC.presence_of_all_elements_located(("xpath", '//*[@id="content"]')))
            print(elements)
            for element in elements:
                print(element.get_attribute("class"))
        except Exception as exc:
            print(f"Youtube does not work: {exc!r}")

        _reportProxyErrorPage(wait)
    finally:
        # The session is local to this test; release it on the Selenium side.
        driver.quit()


def testDobbyCrash():
    """Test the connection to the Dobby webpage.

    Used for example to see whether the proxy can help to connect, or how the
    crashed webpage looks like. Opens its own session and always closes it
    again before returning.
    """
    driver = _connect(startup_delay=5)
    try:
        # define the link to access and access it
        pathDobbyProjects = "https://dobby.ppi.int"
        print(pathDobbyProjects)
        driver.get(pathDobbyProjects)
        time.sleep(5)
        print("full page")
        print(driver.find_elements("xpath", "/html/body/img"))

        pathDobbyProjects = _DOBBY_AD_GROUPS_URL + "?ou=Projekte"
        try:
            driver.get(pathDobbyProjects)
            print("on website now")
        except Exception as exc:
            print(f"could not connect to Dobby website: {exc!r}")
        time.sleep(5)

        notFound = driver.find_elements("xpath", "/html/body/div/div[1]/div[1]/h1")
        print("is website not found?")
        print(notFound)
        notFound = driver.find_elements("id", "root")
        print("maybe root?")
        print(notFound)

        # find the table with all projects (the lower table)
        table = driver.find_elements("xpath", _LOWER_TABLE_XPATH + "/tr[3]")
        print(table)

        # driver will wait for up to 10 seconds
        wait = WebDriverWait(driver, 10)
        _reportProxyErrorPage(wait)
    finally:
        # The session is local to this test; release it on the Selenium side.
        driver.quit()