287 lines
10 KiB
Python
287 lines
10 KiB
Python
from selenium import webdriver
|
|
from selenium.common.exceptions import NoSuchElementException
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
import numpy as np
|
|
import time
|
|
|
|
#some notes:
|
|
#the links to find elements can be found with:
|
|
#right click in the browser and choose "Inspect" (shown as "Untersuchen (Q)" in a German-language browser)
|
|
#shows all elements => find the needed table by hand
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
define a driver = mousepointer using Chrome as Browser (Firefox did not work)
|
|
OUT:
|
|
driver: driver Selenium Object, which can connect to website
|
|
'''
|
|
def getDriver(startupDelay=10, hubUrl='http://sel_browser_test_vimes:4444/wd/hub'):
    """Create a remote Chrome WebDriver session on the Selenium container.

    Chrome is used as the browser (Firefox did not work).

    IN:
        startupDelay: seconds to sleep before connecting; the Selenium
            container must be up before a session can be requested
        hubUrl: URL of the Selenium hub; the default host name is the
            container name (including the ``_vimes`` suffix), so the
            correct URL is always used
    OUT:
        driver: driver Selenium Object, which can connect to website
    """
    # Definitely need to sleep, since Selenium first has to be established.
    time.sleep(startupDelay)

    options = webdriver.ChromeOptions()
    for flag in (
        '--disable-blink-features=AutomationControlled',
        '--ignore-ssl-errors=yes',
        '--ignore-certificate-errors',
    ):
        options.add_argument(flag)

    print("Now will connect")
    driver = webdriver.Remote(command_executor=hubUrl, options=options)
    print("did connect successfully")

    return driver
|
|
|
|
|
|
|
|
|
|
'''
|
|
find names of each Group of PPI.X on Dobby
|
|
IN:
|
|
driver: driver Selenium Object, which can connect to website
|
|
OUT:
|
|
groupsPpiX: Array(n) of string
|
|
'''
|
|
def findGroupNames(driver):
    """Find the names of all PPI.X project groups listed on Dobby.

    The function is best-effort: problems are printed and whatever was
    collected so far is returned instead of raising.

    IN:
        driver: driver Selenium Object, which can connect to website
    OUT:
        groupsPpiX: list of [groupId, groupDescription] pairs (strings) for
            every group whose id starts with 'prj_ppix'
    """
    groupsPpiX = []

    # define the link to access and access it
    pathDobbyProjects = "https://dobby.ppi.int/cgi-bin/ad/adgroups.pl?ou=Projekte"
    print(pathDobbyProjects)
    try:
        driver.get(pathDobbyProjects)
    except Exception:
        # Without the page there is nothing to scrape; do not look at
        # whatever page the browser happens to show at the moment.
        print("could not connect to Dobby website")
        return groupsPpiX
    print("on website now")

    # get data from Dobby
    try:
        # get the table with all projects (the lower table)
        table = driver.find_element("xpath", "/html/body/div/div/table[2]/tbody")

        # go through each row in that table, skipping the first (caption) row
        for row in table.find_elements("xpath", ".//tr[not(position()=1)]"):
            try:
                groupId = row.find_element("xpath", ".//td[1]").text  # first column is Group
                print(groupId)
                groupDesc = row.find_element("xpath", ".//td[2]").text  # second is Description
            except NoSuchElementException:
                print("NoSuchElementException")
                continue
            # only choose groups starting like that = from PPI.X
            if groupId.startswith('prj_ppix'):
                groupsPpiX.append([groupId, groupDesc])
        print("done going through all projects")
    except Exception:
        print("no such tables found, where we get group names")

    return groupsPpiX
|
|
|
|
|
|
|
|
|
|
'''
|
|
find all persons working at PPI.X
|
|
and save names and short notation
|
|
IN:
|
|
groupsNames: Array(n,2) of string
|
|
OUT:
|
|
persons: set(m) of string arrays(2)
|
|
'''
|
|
def getPersons(driver, groupsNames):
    """Collect all persons that are members of the given Dobby groups.

    The function is best-effort: a group whose page cannot be loaded or
    parsed is reported with a print and skipped.

    IN:
        driver: driver Selenium Object, which can connect to website
        groupsNames: iterable of (groupId, groupDescription) string pairs
    OUT:
        persons: set of (short, name) string tuples; a set is used so that
            a person who is a member of several groups is stored only once
    """
    persons = set()
    groupPath = "https://dobby.ppi.int/cgi-bin/ad/adgroups.pl?group="

    for group, _groupDesc in groupsNames:
        try:
            driver.get(groupPath + group)
        except Exception:
            print("could not connect to group webpage")
            continue

        try:
            table = driver.find_element("xpath", "/html/body/div/div/table[2]/tbody")

            # go through each row in the table with short notation and name;
            # the first row (tr[not(position()=1)]) is the caption only
            for row in table.find_elements("xpath", ".//tr[not(position()=1)]"):
                try:
                    short = row.find_element("xpath", ".//td[1]").text
                    print(short)
                    name = row.find_element("xpath", ".//td[2]").text
                except NoSuchElementException:
                    print("NoSuchElementException")
                    continue
                # tuples are hashable, so duplicates collapse inside the set
                persons.add((short, name))
        except Exception:
            print("no such tables found, where we get person names")

    return persons
|
|
|
|
|
|
|
|
'''
|
|
scrapes all data from the Dobby person pages
|
|
which only are their phone numbers
|
|
it is the same order as the names
|
|
IN:
|
|
driver: driver Selenium object
|
|
names: Array(n,2) of string
|
|
OUT:
|
|
allNumbers: array(n, 2) of string
|
|
'''
|
|
def getDobbyPersonData(driver, names):
    """Scrape the mobile number of every given person from Dobby.

    The result rows are in the same order as ``names``.

    IN:
        driver: driver Selenium object
        names: sequence of entries whose element [0] is the short notation
            of a person (element [1], the full name, is not used here)
    OUT:
        allNumbers: numpy array of shape (len(names), 2), dtype 'U60';
            column 0 holds the mobile number (empty string when the page
            has no such table cell), column 1 the short notation
    """
    personPath = "https://dobby.ppi.int/cgi-bin/ad/adgroups.pl?user="
    allNumbers = np.zeros((len(names), 2), dtype='U60')

    for s, person in enumerate(names):
        short = person[0]
        print(short)
        print(personPath + short)
        # store the short notation first so the row stays identifiable even
        # when no number can be read
        allNumbers[s, 1] = short

        driver.get(personPath + short)
        try:
            mobileNumber = driver.find_element(
                "xpath", "/html/body/div/div/table[2]/tbody/tr[9]/td[2]"
            ).text
        except NoSuchElementException:
            # one page without the expected row must not abort the whole run
            print("NoSuchElementException")
            continue
        print(mobileNumber)
        allNumbers[s, 0] = mobileNumber

    print(allNumbers)

    return allNumbers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
From here on only some tests when debugging
|
|
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
test the Driver in Docker with a public webpage on YOUTUBE and print some data
|
|
also use the Proxy we need for Dobby
|
|
'''
|
|
def testDriver():
    """Debug helper: test the driver in Docker against a public web page.

    Opens a remote Chrome session, loads a public YouTube page, prints some
    data about elements found there, and finally checks for the element of
    the error page. The session is always closed again at the end.
    """
    # need to wait a little to assure that the selenium container is up and
    # running, before connecting to it
    time.sleep(3)

    options = webdriver.ChromeOptions()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--ignore-ssl-errors=yes')
    options.add_argument('--ignore-certificate-errors')
    # The PPI proxy "www-cache.ppi.int:3128" was tried via
    # '--proxy-server=...' to get onto dobby.int, but it does not work.

    print("Now will connect")
    # the host name is the container name (with the _vimes suffix), so the
    # correct URL is always used to connect
    driver = webdriver.Remote(
        command_executor='http://sel_browser_test_vimes:4444/wd/hub',
        options=options,
    )
    print("did connect successfully")

    try:
        # just a test, to see if it works for a public website => it does
        pathTest = "https://about.youtube/"
        driver.get(pathTest)
        print(pathTest)
        print("someTesting only")

        # driver will wait for up to 10 seconds
        wait = WebDriverWait(driver, 10)

        try:
            elements = wait.until(
                EC.presence_of_all_elements_located(("xpath", '//*[@id="content"]'))
            )
            print(elements)
            for e in elements:
                print(e.get_attribute("class"))
        except Exception:
            print("Youtube does not work")

        try:
            print("now the element of the wrong page")
            elements = wait.until(
                EC.presence_of_all_elements_located(("id", 'sub-frame-error'))
            )
            print(elements)
        except Exception:
            print("at least not the crashing proxy webpage")
    finally:
        # release the remote browser session on the Selenium container
        driver.quit()
|
|
|
|
|
|
|
|
|
|
'''
|
|
This was used to test the connection to the Dobby webpage
|
|
for example if the Proxy can help to connect
|
|
or how the crashed webpage looks like
|
|
'''
|
|
def testDobbyCrash():
    """Debug helper: test the connection to the Dobby web page.

    Loads the Dobby start page and the project overview, and prints the
    results of several element look-ups to see what the page (or a crashed
    / error page) looks like. The session is always closed again at the end.
    """
    time.sleep(5)

    # define a driver = mousepointer using Chrome as Browser (Firefox did not work)
    options = webdriver.ChromeOptions()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--ignore-ssl-errors=yes')
    options.add_argument('--ignore-certificate-errors')
    # The PPI proxy "www-cache.ppi.int:3128" can be set via
    # '--proxy-server=...' to get onto dobby.int (currently disabled).

    print("Now will connect")
    # the host name is the container name (with the _vimes suffix), so the
    # correct URL is always used to connect
    driver = webdriver.Remote(
        command_executor='http://sel_browser_test_vimes:4444/wd/hub',
        options=options,
    )
    print("did connect successfully")

    try:
        # define the link to access and access it
        pathDobbyProjects = "https://dobby.ppi.int"
        print(pathDobbyProjects)
        driver.get(pathDobbyProjects)
        time.sleep(5)
        print("full page")
        print(driver.find_elements("xpath", "/html/body/img"))

        pathDobbyProjects = "https://dobby.ppi.int/cgi-bin/ad/adgroups.pl?ou=Projekte"
        try:
            driver.get(pathDobbyProjects)
            print("on website now")
        except Exception:
            print("could not connect to Dobby website")

        time.sleep(5)
        notFound = driver.find_elements("xpath", "/html/body/div/div[1]/div[1]/h1")
        print("is website not found?")
        print(notFound)
        notFound = driver.find_elements("id", "root")
        print("maybe root?")
        print(notFound)

        # find the table with all projects (the lower table)
        table = driver.find_elements("xpath", "/html/body/div/div/table[2]/tbody/tr[3]")
        print(table)

        # driver will wait for up to 10 seconds
        wait = WebDriverWait(driver, 10)
        try:
            print("now the element of the wrong page")
            elements = wait.until(
                EC.presence_of_all_elements_located(("id", 'sub-frame-error'))
            )
            print(elements)
        except Exception:
            print("at least not the crashing proxy webpage")
    finally:
        # release the remote browser session on the Selenium container
        driver.quit()
|
|
|
|
|