Scraping Vocabulary using Selenium and parsing to DataFrame - python

I have a small program that opens a vocabulary page, prints all the words on that page, then clicks the button to go to the next page and prints that page's words as well. I used a loop to repeat the process and walk through all the words spread across multiple pages.
# Create csv
outfile = open("Vocab.csv", "w", newline='')
writer = csv.writer(outfile)

# Define the dataframe
df = pd.DataFrame(columns=['rating'])

PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://sq.m.wiktionary.org/w/index.php?title=Kategoria:Shqip&pagefrom=agall%C3%ABk#mw-pages")

for x in range(3):
    rating_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#mw-pages > div > div > div > ul"))
    )
    rating = rating_element.text
    print(rating)
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, "faqja pasardhëse"))
    )
    element.click()
    df2 = pd.DataFrame([rating], columns=['rating'])
    df = df.append(df2, ignore_index=True)
The code itself works fine, but when I tried to add the part that parses all the data into a DataFrame, I only get an empty CSV file. I want a single column containing the thousands of words.

You can iterate over each word and append it to the column:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import selenium.common.exceptions
import os
import pandas as pd

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--window-size=1920x1080")
# chrome_options.add_argument("--headless")
chrome_driver = os.getcwd() + "\\chromedriver.exe"
driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)

# Define the dataframe
df = pd.DataFrame(columns=['rating'])
driver.get("https://sq.m.wiktionary.org/w/index.php?title=Kategoria:Shqip&pagefrom=agall%C3%ABk#mw-pages")

for x in range(200):
    rating_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#mw-pages > div > div > div > ul"))
    )
    rating = rating_element.text
    for word in rating.split('\n'):
        df2 = pd.DataFrame([word], columns=['rating'])
        df = df.append(df2, ignore_index=True)
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.LINK_TEXT, "faqja pasardhëse"))
        )
        element.click()
    except selenium.common.exceptions.TimeoutException:
        break

print(df)
df.to_csv('word_list.csv', encoding='utf-8', index=False)
Outputs
rating
0 agallëk
1 agar
2 agave
3 agde
4 ageshë
.. ...
595 ankim
596 ankimor
597 ankohem
598 ankoj
599 ankojë
[600 rows x 1 columns]
Edit
Added the option to write to a file.
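A side note for newer environments: DataFrame.append was removed in pandas 2.0, so on recent pandas versions the same idea can be expressed by collecting the words into a plain list and building the DataFrame once at the end. A minimal sketch of that variant, reusing the waits and selectors from the answer above:

words = []
for _ in range(200):
    rating_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#mw-pages > div > div > div > ul"))
    )
    # one word per line of the list element's text
    words.extend(rating_element.text.split('\n'))
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.LINK_TEXT, "faqja pasardhëse"))
        ).click()
    except selenium.common.exceptions.TimeoutException:
        break

# build the DataFrame once instead of appending row by row
df = pd.DataFrame({'rating': words})
df.to_csv('word_list.csv', encoding='utf-8', index=False)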

Selenium python- skip an iteration if a web element is not present
I'm trying to fetch data from https://b2b.baidu.com/ after entering a keyword in a search field on the website. I want to skip an iteration if an element is not present on the first page.
I know this should work, but I'm still a novice and can't figure out what I'm doing wrong at the moment. Your help will be greatly appreciated.
Here is what I've done:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import time
import pandas as pd

website = 'https://b2b.baidu.com/'
path = "C:/Users/ICT/chromedriver.exe"
driver = webdriver.Chrome(path)
driver.get(website)
driver.implicitly_wait(4)
wait = WebDriverWait(driver, 10)
driver.maximize_window()

# the search terms, which contain location and keyword, come from a dataframe in another file
from baidu_locations import location_key_row
from baidu_locations import location_data_col
from baidu_locations import key_data_col

for i in range(1, 6):
    website = []
    rep_name = []
    contact = []
    location = []
    keyword = []
    business_name = []
    # Input location and keyword
    enter_query = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[placeholder='我要采购…']")))
    enter_query.clear()
    enter_query.send_keys(location_key_row[i-1])
    location_query = location_data_col[i-1]
    location.append(location_query)
    keyword_query = location_data_col[i-1]
    keyword.append(keyword_query)
    search_type = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "li[class='search-type']")))
    search_type.click()
    # If the company_url element is not available, I want to go on to the next enter_query and continue the iteration.
    try:
        company_url = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div > div:nth-child(1) > div > div > div > div.title-container > span > span.name > a')))
        website.append(company_url.get_property('href'))
        first_location = wait.until(EC.element_to_be_clickable((By.XPATH, '(//span[@class="title link"])[1]')))
        first_location.click()
        driver.switch_to.window(driver.window_handles[1])
        name = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div[class='shop-contact-warp shop-contact-vertical-warp'] div[class='top'] div span[class='show-name']")))
        business_name.append(name.text)
        #print(reps)
        representative = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.shop-index-new-right> div > div.top > div:nth-child(1) > div > div.text > p.sub-text")))
        rep_name.append(representative.text)
        phone_option = wait.until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'查看电话')]")))
        phone_option.click()
        popup_contact = driver.window_handles[1]
        driver.switch_to.window(popup_contact)
        phone_number = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'p[class="phone"]')))
        contact.append(phone_number.text)
        #print(contact_no)
        time.sleep(2)
        return_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'返回')]")))
        return_button.click()
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    except:
        continue

df = pd.DataFrame({'Location': location, 'Keyword': keyword, 'Name': business_name, 'Representative': rep_name, 'Phone': contact, 'Link': website})
So if the company_url element is present on the first page, I want to click on it, go to the new tab, copy the data on that page, return to the first tab, and repeat the process.
If the company_url element is not present, I want to skip that iteration and enter the next search term enter_query from the specified range.
I want to fetch the data for every enter_query where the company_url element is present and save it in a dataframe.
However, this code seems to fetch only one row of data no matter what range I set.
Thank you for your help; kindly let me know if my question is unclear or if you have any questions.
I guess you only want to advance the loop under specific conditions. In that case, why not increment the iterator only when your conditions are satisfied?
Hope the code below helps.
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

website = "https://b2b.baidu.com/"
path = "C:/Users/ICT/chromedriver.exe"
driver = webdriver.Chrome(path)
driver.get(website)
driver.implicitly_wait(4)
wait = WebDriverWait(driver, 10)
driver.maximize_window()

# the search terms which contain location and keyword are from a dataframe in another file
from baidu_locations import key_data_col, location_data_col, location_key_row

# ------------- added -------------
i = index_from = 1
index_to = 6
# ---------------------------------

# ------------------ modified ------------------
while i < index_to:
    # ----------------------------------------------
    website = []
    rep_name = []
    contact = []
    location = []
    keyword = []
    business_name = []
    # Input location and keyword
    enter_query = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[placeholder='我要采购…']"))
    )
    enter_query.clear()
    enter_query.send_keys(location_key_row[i - 1])
    location_query = location_data_col[i - 1]
    location.append(location_query)
    keyword_query = location_data_col[i - 1]
    keyword.append(keyword_query)
    search_type = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "li[class='search-type']"))
    )
    search_type.click()
    # ------------------ modified ------------------
    try:
        company_url = wait.until(
            EC.element_to_be_clickable(
                (
                    By.CSS_SELECTOR,
                    "div > div:nth-child(1) > div > div > div > div.title-container > span > span.name > a",
                )
            )
        )
    except:
        continue
    try:
        # ----------------------------------------------
        website.append(company_url.get_property("href"))
        first_location = wait.until(
            EC.element_to_be_clickable((By.XPATH, '(//span[@class="title link"])[1]'))
        )
        first_location.click()
        driver.switch_to.window(driver.window_handles[1])
        name = wait.until(
            EC.element_to_be_clickable(
                (
                    By.CSS_SELECTOR,
                    "div[class='shop-contact-warp shop-contact-vertical-warp'] div[class='top'] div span[class='show-name']",
                )
            )
        )
        business_name.append(name.text)
        # print(reps)
        representative = wait.until(
            EC.element_to_be_clickable(
                (
                    By.CSS_SELECTOR,
                    "div.shop-index-new-right> div > div.top > div:nth-child(1) > div > div.text > p.sub-text",
                )
            )
        )
        rep_name.append(representative.text)
        phone_option = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'查看电话')]"))
        )
        phone_option.click()
        popup_contact = driver.window_handles[1]
        driver.switch_to.window(popup_contact)
        phone_number = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'p[class="phone"]'))
        )
        contact.append(phone_number.text)
        # print(contact_no)
        time.sleep(2)
        return_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'返回')]"))
        )
        return_button.click()
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        # ------------- added -------------
        # No problem here
        i += 1
        # ---------------------------------
    except:
        continue

df = pd.DataFrame(
    {
        "Location": location,
        "Keyword": keyword,
        "Name": business_name,
        "Representative": rep_name,
        "Phone": contact,
        "Link": website,
    }
)
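One more thing worth checking beyond the skipping logic: the lists website, rep_name, contact, location, keyword and business_name are re-created at the top of every iteration, so the DataFrame built at the end only ever sees the last successful query, which would also explain getting a single row regardless of the range. A rough sketch of accumulating results across iterations instead; search_terms and scrape_company are hypothetical placeholders standing in for the Selenium steps above:

import pandas as pd

rows = []  # accumulates one dict per successfully scraped company

for term in search_terms:           # hypothetical iterable of queries
    data = scrape_company(term)     # hypothetical helper wrapping the Selenium steps; returns a dict or None
    if data is None:                # element missing -> skip this term
        continue
    rows.append(data)

# Build the DataFrame once, after the loop, so every successful row is kept
df = pd.DataFrame(rows, columns=['Location', 'Keyword', 'Name',
                                 'Representative', 'Phone', 'Link'])
df.to_csv('baidu_results.csv', index=False, encoding='utf-8-sig')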

Python Selenium Football Odds Webscraping

I am trying to scrape odds from https://en.stoiximan.gr/live. While my code runs, I get an error because the lists that feed my final dataframe have uneven lengths. Unfortunately, stoiximan seems to mix 3-way odds with over/under odds and suspended/locked matches.
What I am trying to do is remove both the home and the away team from their respective lists when their odds are over/under or locked. Any suggestions?
Here's my code so far:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import openpyxl
import os

# launch chrome and keep window open
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

# visit en.stoiximan.gr and maximize window
driver.get("https://en.stoiximan.gr/live/")
driver.maximize_window()

# close modal window
try:
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((
            By.XPATH, "//button[@class='sb-modal__close__btn uk-modal-close-default uk-icon uk-close']"
        ))).click()
except:
    pass

# accept cookies
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((
        By.ID, "onetrust-accept-btn-handler"
    ))).click()

# Initialize storage for stoiximan
stoiximan_home_teams_list = []
stoiximan_away_teams_list = []
stoiximan_home_odds_list = []
stoiximan_draw_odds_list = []
stoiximan_away_odds_list = []

# grab all home/away teams and explicit odds
try:
    stoiximan_home_teams = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((
            By.XPATH,
            "//div[@class='live-events-event-row__container live-event live-events-event-row__container--row']/div[1]/a/div[1]/div[1]/span"))
    )
    stoiximan_away_teams = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((
            By.XPATH,
            "//div[@class='live-events-event-row__container live-event live-events-event-row__container--row']/div[1]/a/div[1]/div[2]/span"))
    )
    stoiximan_home_odds = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((
            By.XPATH,
            "//div[@class='live-events-event-row__container live-event live-events-event-row__container--row']/div[2]/div/button[1]/span[2]"))
    )
    stoiximan_draw_odds = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((
            By.XPATH,
            "//div[@class='live-events-event-row__container live-event live-events-event-row__container--row']/div[2]/div/button[2]/span[2]"))
    )
    stoiximan_away_odds = WebDriverWait(driver, 1).until(
        EC.presence_of_all_elements_located((
            By.XPATH,
            "//div[@class='live-events-event-row__container live-event live-events-event-row__container--row']/div[2]/div/button[3]/span[2]"))
    )
except:
    driver.quit()

# loop each home team and append the lists
for stoiximan_home_team in stoiximan_home_teams:
    stoiximan_home_teams_list.append(stoiximan_home_team.get_attribute('innerText'))
for stoiximan_away_team in stoiximan_away_teams:
    stoiximan_away_teams_list.append(stoiximan_away_team.get_attribute('innerText'))
for stoiximan_home_odd in stoiximan_home_odds:
    stoiximan_home_odds_list.append(stoiximan_home_odd.text)
for stoiximan_draw_odd in stoiximan_draw_odds:
    stoiximan_draw_odds_list.append(stoiximan_draw_odd.text)
for stoiximan_away_odd in stoiximan_away_odds:
    stoiximan_away_odds_list.append(stoiximan_away_odd.text)

print(stoiximan_home_teams_list)
print(len(stoiximan_home_teams_list))
print(stoiximan_away_teams_list)
print(len(stoiximan_away_teams_list))
print(stoiximan_home_odds_list)
print(len(stoiximan_home_odds_list))
print(stoiximan_draw_odds_list)
print(len(stoiximan_draw_odds_list))
print(stoiximan_away_odds_list)
print(len(stoiximan_away_odds_list))

# make str to float in odds lists
stoiximan_home_odds_list_float = [float(i) for i in stoiximan_home_odds_list]
stoiximan_draw_odds_list_float = [float(j) for j in stoiximan_draw_odds_list]
stoiximan_away_odds_list_float = [float(k) for k in stoiximan_away_odds_list]

# create dictionary for data
stoiximan_dict = {'Stoiximan Home Team': stoiximan_home_teams_list,
                  'Stoiximan Away Team': stoiximan_away_teams_list,
                  'Stoiximan Home Odd': stoiximan_home_odds_list_float,
                  'Stoiximan Draw Odd': stoiximan_draw_odds_list_float,
                  'Stoiximan Away Odd': stoiximan_away_odds_list_float
                  }

# create dataframe for data
df4 = pd.DataFrame(stoiximan_dict)
print(df4)

# write to excel file and open it
df4.to_excel(r'C:\Users\sweet_000\Desktop\data.xlsx', sheet_name="stoiximan", index=False)
os.system('start EXCEL.EXE "C:\\Users\\sweet_000\\Desktop\\data.xlsx"')
driver.quit()
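One way to keep the lists aligned could be to iterate over each event row and read the team names and odds from that single row, skipping any row whose three 1X2 prices are missing or not numeric (over/under or locked markets). A rough sketch of that idea, derived from the row container class and relative paths in the XPaths above, which would replace the parallel-list section:

row_xpath = ("//div[@class='live-events-event-row__container live-event "
             "live-events-event-row__container--row']")

records = []
for row in driver.find_elements(By.XPATH, row_xpath):
    try:
        home = row.find_element(By.XPATH, "./div[1]/a/div[1]/div[1]/span").get_attribute('innerText')
        away = row.find_element(By.XPATH, "./div[1]/a/div[1]/div[2]/span").get_attribute('innerText')
        odds = [row.find_element(By.XPATH, f"./div[2]/div/button[{n}]/span[2]").text for n in (1, 2, 3)]
        odds = [float(o) for o in odds]   # raises ValueError for locked or over/under markets
    except Exception:
        continue                          # skip rows without a full numeric 1X2 market
    records.append({'Stoiximan Home Team': home,
                    'Stoiximan Away Team': away,
                    'Stoiximan Home Odd': odds[0],
                    'Stoiximan Draw Odd': odds[1],
                    'Stoiximan Away Odd': odds[2]})

df4 = pd.DataFrame(records)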

Export multiple scraped items to a csv file in selenium

OK, so I have this code to scrape this link: https://datacentersupport.lenovo.com/gb/en/products/storage/lenovo-storage/s3200/70l8/parts/display/compatible. The code scrapes all the details perfectly, except the substitutes. Here's the full code. Have I missed something?
from selenium import webdriver
from time import sleep
import pandas as pd
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# initializing webdriver
driver = webdriver.Chrome(executable_path="~~chromedriver.exe")
url = " https://datacentersupport.lenovo.com/gb/en/products/storage/lenovo-storage/s3200/70l8/parts/display/compatible."
driver.get(url)
sleep(5)

results = []

# getting breadcrumbs
bread1 = driver.find_element_by_xpath("//span[@class='prod-catagory-name']")
bread2 = driver.find_element_by_xpath("//span[@class='prod-catagory-name']/a")

# grabbing table data and navigating
pages = int(driver.find_element_by_xpath("//div[@class='page-container']/span[@class='icon-s-right active']/preceding-sibling::span[1]").text)
num = pages - 1
for _ in range(pages):
    rows = driver.find_elements_by_xpath("//table/tbody/tr/td[2]/div")
    for row in rows:
        parts = row.text
        results.append([url, bread1.text, parts])
    try:
        for element in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span[class='icon-s-down']"))):
            driver.execute_script("arguments[0].click();", element)
            sleep(5)
        substitute = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//span[@class='icon-s-up']//following::tr[3]/td[contains(@class,'enabled-border')]//div[text()]")))
        for sub in substitute:
            subs = sub.text
            results.append(subs)
    except TimeoutException:
        pass
    except NoSuchElementException:
        break
    finally:
        try:
            pagination = driver.find_element_by_xpath("//div[@class='page-container']/span[@class='icon-s-right active']").click()
            sleep(3)
        except NoSuchElementException:
            break

df = pd.DataFrame(results)
df.to_csv('datacenter2.csv', index=False)
driver.quit()
The results were far from pleasing. I know I am missing something within the loops, but I am not sure what; any suggestion would be highly appreciated.
I need the substitute code to appear alongside the part numbers in the last column for each row it scrapes.
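One thing that stands out is that the substitutes are appended as bare strings (results.append(subs)) while the parts are appended as three-column rows, so they can never line up in the same row. A rough sketch of keeping them together; the substitute XPath here is only a placeholder and would need to be scoped to the expanded section of the current part row:

results = []
for row in driver.find_elements_by_xpath("//table/tbody/tr/td[2]/div"):
    parts = row.text
    # placeholder XPath: locate the substitute cells belonging to this particular part row
    subs_elements = row.find_elements_by_xpath("./ancestor::tr[1]/following-sibling::tr[1]//div[text()]")
    subs = "; ".join(s.text for s in subs_elements)
    results.append([url, bread1.text, parts, subs])   # substitutes share the row's last column

df = pd.DataFrame(results, columns=['URL', 'Category', 'Part', 'Substitutes'])
df.to_csv('datacenter2.csv', index=False)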

How to save to CSV instead of printing to terminal Selenium webscrape data

I was finally able to scrape data from the website and print the headlines and dates to the terminal. But I want to save them to a CSV file with one column for the headlines and one column for the dates. How do I do that?
My code is attached below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver = webdriver.Chrome(
    chrome_options=options,
    executable_path=r"//usr/local/Caskroom/chromedriver/81.0.4044.69/chromedriver")
driver.get(
    "https://www.nytimes.com/search?dropmab=true&endDate=20180111&query=nyc&sections=New%20York%7Cnyt%3A%2F%2Fsection%2F39480374-66d3-5603-9ce1-58cfa12988e2&sort=best&startDate=20180107")

# keep clicking "Show more" until no new results load
myLength = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located(
    (By.XPATH, "//figure[@class='css-tap2ym']//following::a[1]"))))
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, "//div[@class='css-vsuiox']//button[@data-testid='search-show-more-button']"))).click()
        WebDriverWait(driver, 20).until(lambda driver: len(driver.find_elements_by_xpath(
            "//figure[@class='css-tap2ym']//following::a[1]")) > myLength)
        titles = driver.find_elements_by_xpath(
            "//figure[@class='css-tap2ym']//following::a[1]")
        myLength = len(titles)
    except TimeoutException:
        break

headlines_element = driver.find_elements_by_xpath('//p[@class="css-16nhkrn"]')
headlines = [x.text for x in headlines_element]
print('headlines:')
print(headlines, '\n')

dates_element = driver.find_elements_by_xpath("//time[@class='css-17ubb9w']")
dates = [x.text for x in dates_element]
print("dates:")
print(dates, '\n')

for headline, date in zip(headlines, dates):
    print("Headlines : Dates")
    print(headline + ": " + date, '\n')
driver.quit()
It's that last bit of code that gets the headline and the dates. Thanks in advance for the help!
You can use csv.writer to write the data to the csv file.
Use:
import csv

with open("your_csv_file", "w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Headlines", "Dates"])  # --> Write header
    for h, d in zip(headlines, dates):
        writer.writerow([h, d])  # --> Write data
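If you prefer to stay with pandas (already used in the other snippets on this page), an equivalent sketch builds a DataFrame from the two lists and lets to_csv write the file; the file name here is just an example:

import pandas as pd

df = pd.DataFrame({'Headlines': headlines, 'Dates': dates})
df.to_csv('nyt_headlines.csv', index=False, encoding='utf-8')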

Extracting reviews from Google play store app website

import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd

class FindByXpathCss():
    # Declaring variables
    Reviews = []  # List to store final set of reviews
    reviewText = []  # List to store reviews extracted from XPath
    reviewFullText = []
    # Chromedriver path
    driver = webdriver.Chrome(executable_path=r"F:\Chrome-webdriver\chromedriver.exe")
    driver.maximize_window()
    baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
    driver.get(baseUrl)
    # driver.execute_script("scrollBy(0,300);")

    # Scrolling down
    for i in range(20):
        driver.find_element_by_xpath('//*[@id="yDmH0d"]').send_keys(Keys.ARROW_DOWN, i)
        time.sleep(0.5)

    # To click on Show more button
    # btnShowMore = driver.find_element_by_xpath('//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]''/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span').click()

    # Scrolling to top
    for j in range(10):
        driver.find_element_by_xpath('//*[@id="yDmH0d"]').send_keys(Keys.ARROW_UP, j)

    # for i in range(10):
    review_btn = driver.find_elements_by_xpath("//button[contains(@class,'')][contains(text(),'Full Review')]")
    single_review_btn = driver.find_element_by_xpath("//button[contains(@class,'')][contains(text(),'Full Review')]")
    # time.sleep(1)
The review div contains two spans: one with jsname 'fbQN7e', which holds the longer reviews that end with a "Full Review" button, and one with jsname 'bN97Pc', which holds the shorter reviews without that button. I couldn't get the reviews from both types of span. I also tried writing the reviewFullText list directly to the dataframe, but I only get the element objects, not their text, and I don't know why that happens either.
    for btn in review_btn:
        btn.click()
    reviewFullText = driver.find_elements_by_css_selector("span[jsname='fbQN7e']")
    # if (single_review_btn.is_enabled() == False):
    #     reviewText = driver.find_elements_by_css_selector("span[jsname=\"bN97Pc\"]")
    # else:
    #     pass

    # Iterating each review and appending into list Reviews
    for txtreview in reviewText:
        reviewFullText.append(txtreview.text)
    print(len(reviewFullText))

    # Writing the list values into csv file
    df = pd.DataFrame(reviewFullText)
    # df = pd.DataFrame({'Reviews': 'Reviews'})  # 'Sentiment': 'null'})
    df.to_csv('Reviews.csv', index=True, encoding='utf-8')
    driver.close()
I have modified your solution to retrieve all reviews from the page.
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

class FindByXpathCss():
    driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
    driver.maximize_window()
    baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
    driver.get(baseUrl)

    # scroll to the bottom a few times so that more reviews are loaded
    scrolls = 3
    while True:
        scrolls -= 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(3)
        if scrolls < 0:
            break

    # expand every "Full Review" button
    buttonClick = WebDriverWait(driver, 30).until(
        EC.visibility_of_all_elements_located((By.XPATH, "//button[contains(@class,'')][contains(text(),'Full Review')]")))
    for element in buttonClick:
        driver.execute_script("arguments[0].click();", element)

    # both short and expanded reviews share the UD7Dzf class
    reviewText = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.XPATH, "//*[@class='UD7Dzf']")))
    # reviewText = driver.find_elements_by_xpath("//*[@class='UD7Dzf']")
    for textreview in reviewText:
        print(textreview.text)
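Since the original code writes the reviews to Reviews.csv, one way to finish this version could be to collect the texts into a list and save them with pandas, for example:

import pandas as pd

# 'reviewText' is the list of review elements collected above
reviews = [element.text for element in reviewText]
df = pd.DataFrame({'Reviews': reviews})
df.to_csv('Reviews.csv', index=False, encoding='utf-8')
driver.close()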