Selenium - undesired form filling - Python

I am relatively new to Python, and tried my hand at Selenium to download 5 years of QoQ historical financial records for 800 companies. I was quite happy with the results, but needed quite a few manual workarounds to get what I wanted (which I eventually did).
Strange thing though: sometimes it filled in the company search field with a string that was not even in my Python list (e.g. the item in the list was "IP", but the field searched for "ARIP"). I suspect it is using the first item in the dropdown list instead of the item typed in (see below).
Please don't laugh too much at the attached code; I am not a pro, but it works and that's all that matters.
Any advice / teaching / code changes so I can learn would be appreciated.
PS: I tried Selenium several times before and it never worked. Then I stumbled on a thread on Stack Overflow about XPath, and it worked like a charm!
import selenium
import pandas as pd
from selenium import webdriver
from pandas import DataFrame
import time
from selenium.webdriver.support.ui import Select
import os
import shutil
import glob
driver = webdriver.Chrome()
driver.get('MyURL')
username_input = '//*[@id="username"]'
password_input = '//*[@id="password"]'
login_submit = '//*[@id="login-btn"]'
driver.find_element_by_xpath(username_input).send_keys('MyLogin')
driver.find_element_by_xpath(password_input).send_keys('MyPwD')
driver.find_element_by_xpath(login_submit).click()
time.sleep(10)
dropdown = driver.find_element_by_id("menu-Company")
dropdown.click()
driver.find_element_by_xpath('//*[@id="Financial Statements"]').click()
companyname=[]
companyname=['ABPIF','AJA','ALLA','ALT','ALUCON','AMA','AMANAH','AP','AQUA','ARIN','B','BC','BM','CHAYO','CHEWA','CHG','CI','CIG','CIMBT','CITY','CK','CKP','CM','CMC','CMO','CNS','CPW','CRANE','CRD','CSC','CSP','CSS','CTARAF','CTW','CWT','D','DDD','DELTA','DEMCO','DIF','DIMET','DOD','DOHOME','DREIT','DRT','DTAC','DTC','EA','EASON','EASTW','ECF','ECL','EE','EGATIF','EGCO','EIC','EKH','EMC','EPCO','EPG','ERW','ESSO','ETE','EVER','F&D','FANCY','FE','FLOYD','FMT','FN','FNS','FORTH','FSS','FTE','FVC','GAHREIT','GBX','GC','GCAP','GEL','GENCO','GFPT','GIFT','GJS','GL','GLAND','GLOBAL','GLOCON','GOLD','GOLDPF','GPI','GPSC','GRAND','GREEN','GSC','GSTEEL','GTB','GULF','GUNKUL','GVREIT','GYT','HANA','HARN','HFT','HREIT','ICC','ICHI','ICN','IFEC','ILM','IMPACT','INET','IP','IT','J','K','KC','KKC','M','MC','ML','MM','NC','NDR','NETBAY','NEW','NEWS','NEX','NFC','NINE','NMG','PE','PF','PG','PICO','PK','PL','PRO','PT','PTG','RAM','ROCK','RPC','RS','S','SC','SE','SMART','SR','STA','STAR','STC','T','TC','TH','THAI','TM','TR','TRT','TRUE','TSE','TSI','TTA','TU','U','UP','UT','VI','VL','WG','WORK','WP','WR']
Counter=0
for name in companyname:
    Counter = Counter + 1
    print('Name:', name)
    companyfield = '//*[@id="input-search"]'
    driver.find_element_by_xpath(companyfield).send_keys(name)
    driver.find_element_by_xpath(companyfield).send_keys(u'\ue007')
    if Counter == 1:
        # Give some time to set up the web page, i.e. QoQ, from and to date, and Profit and Loss or Balance Sheet
        time.sleep(30)
    elif Counter == 30:
        time.sleep(20)  # give web site a breather
    driver.find_element_by_xpath('//*[@id="export-link"]').click()
    time.sleep(3)
    driver.find_element_by_xpath(companyfield).clear()
    # Strangely, if over 100 xlsx are downloaded it plays up - so move them out to another folder and restart the counter
    time.sleep(0.3)
    if Counter == 70:
        files = glob.iglob(os.path.join("/Users/cg/Downloads", "*.xlsx"))
        for file in files:
            if os.path.isfile(file):
                ts = time.time()
                shutil.copy2(file, "/Users/cg/Python/junk/SETScrape/Output1/" + str(ts) + ".xlsx")
        files = glob.glob("/Users/cg/Downloads/*.xlsx")
        for f in files:
            os.remove(f)
        Counter = 2
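If the stray "ARIP" searches really do come from the autocomplete defaulting to its first suggestion, one workaround is to wait for the suggestion that exactly matches the typed ticker and click it, instead of sending Enter. A rough sketch for the inside of the loop above (driver and name come from that loop; the suggestion-list XPath is a guess about the page, not taken from it):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

search_box = driver.find_element_by_xpath('//*[@id="input-search"]')
search_box.clear()
search_box.send_keys(name)

# wait for the autocomplete entry whose text is exactly the ticker, then click it
# (the XPath for the suggestion list items is an assumption)
suggestion_xpath = f'//li[normalize-space(text())="{name}"]'
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, suggestion_xpath))).click()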

Related

Python - Downloading PDFs from website behind dropdown

For the site https://www.wsop.com/tournaments/results/, the objective is to download all available PDFs in the REPORTS section, behind all the different drop-down options where they are available.
Currently I am trying to do this using Selenium, because I couldn't find an API, but I am open to other suggestions. For now the code is a bunch of copy-paste from relevant questions and YT videos.
My plan of attack is to select an option in the drop-down menu, press 'GO' (to load the results), navigate to 'REPORTS' (if available) and download all the PDFs available. And then iterate over all options. Challenge 2 is then to get the PDFs into something like a dataframe to do some analysis.
Below is my current code, which only manages to download the top PDF of the option selected by default in the drop-down:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
#settings and loading webpage
options=Options()
options.headless=True
CD=ChromeDriverManager().install()
driver=webdriver.Chrome(CD,options=options)
params={'behavior':'allow','downloadPath':os.getcwd()+'\\PDFs'}
driver.execute_cdp_cmd('Page.setDownloadBehavior',params)
driver.get('https://www.wsop.com/tournaments/results/')
#Go through the dropdown
drp=Select(driver.find_element_by_id("CPHbody_aid"))
drp.select_by_index(0)
drp=Select(driver.find_element_by_id("CPHbody_grid"))
drp.select_by_index(1)
drp=Select(driver.find_element_by_id("CPHbody_tid"))
drp.select_by_index(5)
#Click the necessary buttons (section with issues)
driver.find_element_by_xpath('//*[@id="nav-tabs"]/a[6]').click()
#driver.find_element_by_name('GO').click()
#WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.LINK_TEXT, "GO"))).click()
#WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.LINK_TEXT, "REPORTS"))).click()
a=driver.find_element_by_id("reports").click()
I can navigate through the drop-down just fine (and it should be easy to iterate over it). However, I do not manage to get the 'GO' button pressed. I tried it a bunch of different ways, a few of which I showed as comments in the code.
I am able to press the REPORTS tab, but I think that breaks down when there is a different number of tabs; the line in the comments might work better, but for now I am not able to download all the PDFs anyway, it just takes the first PDF on the page.
Many thanks to whoever can help :)
The website is structured in such a way that you can loop through the years that a WSOP was played, then within each year you can loop through every event and get the data from the page into a pandas dataframe. This is far more efficient than taking screenshots into PDFs.
You can edit how far back you want to go with the from_year variable in line 5; going way back will obviously take more time. See the script below, which will output all the data into a CSV. Note that not every event has POY points available. Also, you'll need to pip install requests, pandas and bs4 if you haven't already.
import requests
import pandas as pd
from bs4 import BeautifulSoup

from_year = 2020

wsop_tournament_url = 'https://www.wsop.com/tournaments/GetTournaments.aspx?aid=2'
wsop_resp = requests.get(wsop_tournament_url)
soup = BeautifulSoup(wsop_resp.text, 'html.parser')
years = [x['value'] for x in soup.find_all('option') if str(from_year) in x.text]

event_dfs = []
for year in years:
    event_resp = requests.get(f'https://www.wsop.com/tournaments/GetEvents.aspx?grid={str(year)}')
    soup = BeautifulSoup(event_resp.text, 'html.parser')
    event_ids = [x['value'] for x in soup.find_all('option')]
    for event in event_ids:
        page = 1
        while True:
            url = f'https://www.wsop.com/tournaments/results/?aid=2&grid={year}&tid={event}&rr=5&curpage={page}'
            results = requests.get(url)
            soup = BeautifulSoup(results.text, 'html.parser')
            info = soup.find('div', {'id': 'eventinfo'})
            dates = info.find('p').text
            name = info.find('h1').text
            year_name = soup.find('div', {'class': 'content'}).find('h3').text.strip()
            table = soup.find('div', {'id': 'results'})
            size = int(table.find('ul')['class'][0][-1])
            rows = table.find_all('li')
            if len(rows) <= size + 1:
                break
            print(f'processing {year_name} - {name} - page {page}')
            output = []
            headers = []
            for x in range(size):
                series = []
                for i, row in enumerate(rows):
                    if i == x:
                        headers.append(row.text)
                        continue
                    if i % size == x:
                        series.append(row.text)
                output.append(series)
            df = pd.DataFrame(output)
            df = df.transpose()
            df.columns = headers
            df['year_name'] = year_name
            df['event_id'] = event
            df['year_id'] = year
            df['event_name'] = name
            df['dates'] = dates
            df['url'] = url
            event_dfs.append(df)
            page += 1
    print(f'Scraped {year_name} successfully')

final_df = pd.concat(event_dfs)
final_df.to_csv('wsop_output.csv', index=False)
I am not going to write you the whole script, but here's how to click on the "GO" button:
We can see from the developer tools that the button is the only element to have the class "submit-red-button", so we can access it with: driver.find_elements_by_class_name('submit-red-button')[0].click()
You say that you can access the Reports tab, but it did not work when I tested your program, so just in case, you can use driver.find_elements_by_class_name('taboff')[4] to get it.
Then, all you need to do is click on each PDF link in order to download the files.
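Putting those hints together, a minimal sketch of the click sequence might look like this. It reuses the driver from the question; the class names submit-red-button and taboff come from the answer above, while the wait and the PDF-link locator are assumptions that would need checking against the live page.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 20)

# click GO -- per the answer, it is the only element with this class
wait.until(EC.element_to_be_clickable(
    (By.CLASS_NAME, 'submit-red-button'))).click()

# open the REPORTS tab -- index 4 of the inactive tabs, per the answer
driver.find_elements_by_class_name('taboff')[4].click()

# click every link that looks like a PDF -- this locator is a guess
for link in driver.find_elements_by_partial_link_text('.pdf'):
    link.click()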

Why can't I scrape class elements using for loop in Selenium?

I am trying to scrape this website using the Python programming language and Selenium. I was able to scrape data easily without a for loop, but whenever I use a for loop to scrape elements I get errors. I also tried using a while loop with try and except, but it was no help at all. This is my Python code:
from logging import exception
from typing import Text
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
import time
import pandas as pd
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import csv
from selenium import webdriver
PATH = "C:/ProgramData/Anaconda3/scripts/chromedriver.exe"  # always keep chromedriver.exe inside scripts to save hours of debugging
driver = webdriver.Chrome(PATH)  # pretty important part
driver.get("https://www.gharghaderi.com/")
driver.implicitly_wait(10)
house = driver.find_elements_by_class_name('griddetails')
for x in house:
    driver.get(x)
    print(x.text)
and this is the error I am constantly getting after using the for loop.
When you write this:
for x in house:
it means: for every x in the house list.
Your house list contains all the web elements with class griddetails, and in the loop you are doing
driver.get(x)
which means you want to open every web element, which is wrong.
Moreover, get() expects a URL in string format.
Instead, if you just want to print the details, you can do this:
house = driver.find_elements_by_class_name('griddetails')
for x in house:
    print(x.text)
this should give you proper output.
sample output :
रु. 2,50,00,000
Land: 0-4-0-4 Road: 12 ft
Chapali 2 Budhanilkantha, Kathmandu
Chandra Bahadur Karki
ID 472
Update 1 :
house_list = []
house = driver.find_elements_by_class_name('griddetails')
for x in house:
    house_list.append(x.text)

data = {
    'Details': house_list
}
df = pd.DataFrame.from_dict(data)
df.to_csv('out.csv', index=0)
Imports :
import pandas as pd
Your error indicates an issue with the driver.get(x) line inside the for loop (line 19).
driver.get() expects a URL to open. However, I believe you are passing it web elements with the class name griddetails. What you want is the text inside that HTML.
Inside the loop, try printing x or x.text and see what x is. Then you should try to find out how to extract the text you want.
Looks like the text you want is inside the span tag. So try looking there and find a way to extract the text from there. Sorry, I can't test the code myself atm.
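Following that suggestion, a rough sketch (reusing the driver from the question) could look like the snippet below; that the listing text lives in a span inside each griddetails element is an assumption based on the answer above, not verified against the page.

house = driver.find_elements_by_class_name('griddetails')
details = []
for x in house:
    # assumed structure: each griddetails element wraps its text in a <span>
    span = x.find_element_by_tag_name('span')
    details.append(span.text)
print(details)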

How to web scrape data with Selenium and BeautifulSoup and output to CSV file?

I am relatively new to Python and the Stack Overflow community as well. I am using selenium to web scrape https://freightliner.com/dealer-search/ for dealership names and addresses in North/South America and have been able to print it as a single string with no problems, but I cannot figure out how to export it to a csv file. The difference between the way that I am printing it in my code and how I want to export it to csv is that I am printing the name and address as a single string delimited by a semicolon whereas I want to export it to a csv as separate columns (name, address). The following is what I have tried:
#! python3
# fl_dealers.py - Scrapes freightliner website for north american locations.
# import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time, os, csv
from bs4 import BeautifulSoup
# set Chrome options to automatically download file
options = webdriver.ChromeOptions()
prefs = {'download.default_directory': r'C:\Users\username\Downloads\\'}
options.add_experimental_option('prefs',prefs)
chromedriver = 'C:/Users/username/chromedriver.exe'
# change directory to Downloads folder
os.chdir("C:\\Users\\username\\Downloads")
# create webdriver object and call Chrome options
browser = webdriver.Chrome(executable_path=chromedriver, options=options)
# maximize the browser window
browser.maximize_window()
# set wait time to allow browser to open
browser.implicitly_wait(10) # seconds
# open freightliner website
browser.get('https://freightliner.com/dealer-search/')
# maximize the browser window
browser.maximize_window()
time.sleep(5)
# find all locations in north america
search = browser.find_element_by_xpath('//*[@id="by-location"]/div/div/input')
ActionChains(browser).move_to_element(search).click().key_down(Keys.CONTROL).send_keys('a').key_up(Keys.CONTROL).send_keys("USA").perform()
#search.send_keys('USA')
search_button = browser.find_element_by_xpath('//*[@id="by-location"]/button').click()
time.sleep(10)
# create variable for webpage AFTER searching for results
page_source = browser.page_source
# create bs4 object
soup = BeautifulSoup(page_source, 'html.parser')
# create variables for dealer name and address
names = soup.find_all('h2')[1:]
addresses = soup.find_all(class_='address')
# print the names and addresses
for name, address in zip(names, addresses):
    print(name.get_text(separator=" ").strip(), ";", address.get_text(separator=", ").strip())

with open('fl_dealers.csv', mode='w', newline='') as outputFile:
    dealershipsCSV = csv.writer(outputFile)
    dealershipsCSV.writerow(['name', 'address'])
    for name in names:
        dealer_name = name.get_text
        for address in addresses:
            dealer_address = address.get_text
        dealershipsCSV.writerow([dealer_name, dealer_address])
The code does create a CSV file, but it only creates the column headers and does not export any of the actual names and addresses. I have searched numerous Stack Overflow, GitHub and YouTube posts related to the issue, but have not been able to find a solution. I have reached the limit of my knowledge thus far. There is a high likelihood that I am missing something very simple. Alas, I am still new to Python.
One thing to note - The reasoning for entering "USA" in the search bar is to override the website's default of using my location to search for nearby dealers. Even though the query is for "USA", it returns all North/South American dealers which is what I want.
Any and all help is greatly appreciated! Thank you.
I guess your main problem is that you should append the names and addresses in a loop, according to the length of the data.
Also, you should use append mode, not write mode.
Please try this:
from csv import writer

with open('fl_dealers.csv', 'a', newline='') as outputFile:
    writer_object = writer(outputFile)
    for name, address in zip(names, addresses):
        writer_object.writerow([name.get_text(separator=" ").strip(),
                                address.get_text(separator=", ").strip()])

Python, Selenium, web tab assistance

There is a web site, and I can get the data I need from it with Python / Selenium (I am new to Selenium and Python).
On the web page there are TABS. I can get the data on the first tab, as that one is active by default, but I cannot get the data on the second TAB.
I attached an image: it shows the data in the Overview TAB; I want to get the data in the Fundamental TAB as well. This web page is investing.com.
As for the code (I did not use everything yet, some parts were added for future use):
from time import sleep, strftime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import smtplib
from email.mime.multipart import MIMEMultipart
from bs4 import BeautifulSoup
url = 'https://www.investing.com/stock-screener/?sp=country::6|sector::a|industry::a|equityType::a|exchange::a|last::1,1220|avg_volume::250000,15950000%3Ceq_market_cap;1'
chrome_path = 'E:\\BackUp\\IT\\__Programming\\Python\\_Scripts\\_Ati\\CSV\\chromedriver'
driver = webdriver.Chrome(chrome_path)
#driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.get(url)
my_name = driver.find_elements_by_xpath("//td[@data-column-name='name_trans']")
my_symbol = driver.find_elements_by_xpath("//td[@data-column-name='viewData.symbol']")
my_last = driver.find_elements_by_xpath("//td[@data-column-name='last']")
my_change = driver.find_elements_by_xpath("//td[@data-column-name='pair_change_percent']")
my_marketcap = driver.find_elements_by_xpath("//td[@data-column-name='eq_market_cap']")
my_volume = driver.find_elements_by_xpath("//td[@data-column-name='turnover_volume']")
The code above all works.
The XPath for the second tab does not work.
PE Ratio is in the second tab (in the Fundamentals).
I tried these three:
my_peratio = driver.find_elements_by_xpath("//*[@id='resultsTable']/tbody/tr[1]/td[4]")
my_peratio = driver.find_elements_by_xpath("//*[@id='resultsTable']")
my_peratio = driver.find_elements_by_xpath("//td[@data-column-name='eq_pe_ratio']")
There are no error messages, but the list 'my_peratio' has nothing in it. It is empty.
I would really appreciate it if you could point me in the right direction.
Thanks a lot
Ati
Probably the data shown on the second tab is loaded dynamically.
In that case, you have to click on the second tab to show the data first:
driver.find_element_by_xpath("selector_for_second_tab").click()
After that it should be possible to get the data.
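As a rough sketch of that idea, reusing the driver from the question: the link-text locator for the Fundamental tab and the explicit wait below are assumptions, not verified against investing.com; the eq_pe_ratio column name is taken from the question's third attempt.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# assumed locator: the tab is found by its visible label
driver.find_element_by_link_text("Fundamental").click()

# wait until the tab's cells are present before reading them
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, "//td[@data-column-name='eq_pe_ratio']")))

my_peratio = driver.find_elements_by_xpath("//td[@data-column-name='eq_pe_ratio']")
print([cell.text for cell in my_peratio])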

Getting all visible text from a webpage using Selenium

I've been googling this all day without finding the answer, so apologies in advance if this has already been answered.
I'm trying to get all visible text from a large number of different websites. The reason is that I want to process the text to eventually categorize the websites.
After a couple of days of research, I decided that Selenium was my best chance. I've found a way to grab all the text with Selenium; unfortunately, the same text is being grabbed multiple times:
from selenium import webdriver
import codecs
filen = codecs.open('outoput.txt', encoding='utf-8', mode='w+')
driver = webdriver.Firefox()
driver.get("http://www.examplepage.com")
allelements = driver.find_elements_by_xpath("//*")
ferdigtxt = []
for i in allelements:
    if i.text in ferdigtxt:
        pass
    else:
        ferdigtxt.append(i.text)
        filen.writelines(i.text)
filen.close()
driver.quit()
The if condition inside the for loop is an attempt at eliminating the problem of fetching the same text multiple times; however, it only works as planned on some webpages (and it also makes the script A LOT slower).
I'm guessing the reason for my problem is that when asking for the inner text of an element, I also get the inner text of the elements nested inside the element in question.
Is there any way around this? Is there some sort of master element whose inner text I can grab? Or a completely different way that would enable me to reach my goal? Any help would be greatly appreciated, as I'm out of ideas for this one.
Edit: the reason I used Selenium and not Mechanize and Beautiful Soup is because I wanted JavaScript rendered text.
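One way to read the "master element" idea from the question is to take the text of the body element once instead of walking every node; Selenium's .text returns an element's rendered, visible text, so nested text is not repeated. A minimal sketch (whether the output matches the lxml approaches below would need checking per site):

import codecs
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("http://www.examplepage.com")

# .text on <body> yields the page's visible text once,
# without duplicating the text of nested descendants
visible_text = driver.find_element_by_tag_name('body').text

with codecs.open('output.txt', encoding='utf-8', mode='w+') as filen:
    filen.write(visible_text)

driver.quit()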
Using lxml, you might try something like this:
import contextlib
import selenium.webdriver as webdriver
import lxml.html as LH
import lxml.html.clean as clean
url="http://www.yahoo.com"
ignore_tags=('script','noscript','style')
with contextlib.closing(webdriver.Firefox()) as browser:
    browser.get(url)  # Load page
    content = browser.page_source
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    with open('/tmp/source.html', 'w') as f:
        f.write(content.encode('utf-8'))
    doc = LH.fromstring(content)
    with open('/tmp/result.txt', 'w') as f:
        for elt in doc.iterdescendants():
            if elt.tag in ignore_tags:
                continue
            text = elt.text or ''
            tail = elt.tail or ''
            words = ' '.join((text, tail)).strip()
            if words:
                words = words.encode('utf-8')
                f.write(words + '\n')
This seems to get almost all of the text on www.yahoo.com, except for text in images and some text that changes with time (done with javascript and refresh perhaps).
Here's a variation on @unutbu's answer:
#!/usr/bin/env python
import sys
from contextlib import closing
import lxml.html as html # pip install 'lxml>=2.3.1'
from lxml.html.clean import Cleaner
from selenium.webdriver import Firefox # pip install selenium
from werkzeug.contrib.cache import FileSystemCache # pip install werkzeug
cache = FileSystemCache('.cachedir', threshold=100000)
url = sys.argv[1] if len(sys.argv) > 1 else "https://stackoverflow.com/q/7947579"
# get page
page_source = cache.get(url)
if page_source is None:
    # use firefox to get page with javascript generated content
    with closing(Firefox()) as browser:
        browser.get(url)
        page_source = browser.page_source
    cache.set(url, page_source, timeout=60*60*24*7) # week in seconds
# extract text
root = html.document_fromstring(page_source)
# remove flash, images, <script>,<style>, etc
Cleaner(kill_tags=['noscript'], style=True)(root) # lxml >= 2.3.1
print root.text_content() # extract text
I've separated your task into two parts:
get the page (including elements generated by JavaScript)
extract the text
The code is connected only through the cache. You can fetch pages in one process and extract text in another process, or defer it and do it later using a different algorithm.
