Scraping a web page with Python

I tried to scrape all the 'a' tag text values from this sample link: https://www.fundoodata.com/companies-in/list-of-apparel-stores-companies-in-india-i239
What I need is to copy only the names inside the 'a' tags and save them to a CSV file.
I'm a beginner; can anyone help me out? Below is my broken code:
# importing the modules
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import os

link = "https://www.fundoodata.com/companies-in/list-of-apparel-stores-companies-in-india-i239"

# instantiating empty lists
nameList = []

for i in range(1):
    driver = webdriver.Chrome(ChromeDriverManager().install())
    # fetching all the store details
    storeDetails = driver.find_elements_by_class_name('search-result')
    # iterating the storeDetails
    for j in range(len(storeDetails)):
        # fetching the name, address and contact for each entry
        name = storeDetails[j].find_element_by_class_name('heading').text
        myList = []
        nameList.append(name)
    driver.close()

# initialize data of lists
data = {'Company Name': nameList}

# Create DataFrame
df = pd.DataFrame(data)
print(df)

# Save Data as .csv
df.to_csv("D:\xxx\xxx\xx\xxx\demo.csv", mode='w+', header=False)

There are lots of problems here, the first of which is that the page is never opened:
## to open the webpage
driver.get(link)
Secondly, you don't need the outer for loop at all. Lastly:
from selenium.webdriver.common.by import By

## find the a tags inside the search results
storedetails = driver.find_elements(By.CSS_SELECTOR, 'div.heading a')

## iterating over the elements list
for name in storedetails:
    ## appending the a tag text
    nameList.append(name.text)
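Putting it all together, a minimal runnable sketch (assuming the page still uses the search-result/heading markup from your question, and writing the CSV to the working directory):

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

link = "https://www.fundoodata.com/companies-in/list-of-apparel-stores-companies-in-india-i239"

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(link)  # this was the missing step

# collect the text of every a tag inside the heading divs
nameList = [a.text for a in driver.find_elements(By.CSS_SELECTOR, 'div.heading a')]
driver.quit()

# build the DataFrame and save it
df = pd.DataFrame({'Company Name': nameList})
df.to_csv("demo.csv", index=False)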
I hope this helped! :)

Related

Web scraping Amazon and creating a list of dicts

I am experiencing some trouble with my code when scraping the Amazon site with Selenium.
I want a list of dictionaries with the title and author of each book as keys and values, in the format:
[{TITLE:'x', AUTHOR:'y'},
 {TITLE:'z', AUTHOR:'w'}]
However, it returns a dictionary of lists, with keys and values repeated, in the format:
{TITLE:['x'], AUTHOR:['y']}
{TITLE:['x', 'z'], AUTHOR:['y', 'r']}
{TITLE:['x', 'z', 'q'], AUTHOR:['y', 'r', 'p']}
That is: on each iteration it takes the previous values and repeats them in the next dictionary. That is not supposed to happen. What am I doing wrong?
Here is my code:
Firstly, I import the libraries:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
Secondly, I install the proper version of chromedriver:
Service(ChromeDriverManager().install())
Thirdly, I open the browser automatically:
options = Options()
options.add_argument('window-size=250,1000')
driver = webdriver.Chrome(executable_path=r'C:\Users\dambr\Documents\scrapping\chromedriver.exe', options=options)
driver.implicitly_wait(5)
Fourthly, I open the Amazon site:
driver.get('https://www.amazon.com.br/')
a = driver.find_element(By.ID, "twotabsearchtextbox")
a.click()
a.send_keys('python')
b = driver.find_element(By.ID, "nav-search-submit-button")
b.click()
sleep(3)
Finally, I take all the titles and authors of my search and try to store in a list of dictionaries:
dic_livros = {'TÍTULO':[], 'AUTOR':[]}
lista = '//*[@id="search"]/div[1]/div[1]/div/span[1]'
for i in lista:
    title = driver.find_elements(By.XPATH, "//span[@class='a-size-base-plus a-color-base a-text-normal']")
    author = driver.find_elements(By.XPATH, "//span[@class='a-size-base']")
    for (each_title, each_author) in zip(title, author):
        dic_livros['TÍTULO'].append(each_title.text)
        dic_livros['AUTOR'].append(each_author.text)
    print(dic_livros)
Where, precisely, is my mistake?
Your last step needs two changes. Replace the first line with:
dic_livros = []
then, for the for loop:
for (each_title, each_author) in zip(title, author):
    dic_livros.append({'Titulo': each_title.text, 'Autor': each_author.text})
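Note also that your outer loop, for i in lista:, iterates over the characters of the XPath string, so the scrape-and-append block runs once per character; that is why the values accumulate. You can drop that loop entirely. A sketch of the corrected final step (the XPaths are the ones from your question and may change as Amazon updates its markup):

dic_livros = []  # a list, not a dict of lists

title = driver.find_elements(By.XPATH, "//span[@class='a-size-base-plus a-color-base a-text-normal']")
author = driver.find_elements(By.XPATH, "//span[@class='a-size-base']")

# one dictionary per book, so nothing accumulates across iterations
for (each_title, each_author) in zip(title, author):
    dic_livros.append({'TÍTULO': each_title.text, 'AUTOR': each_author.text})

print(dic_livros)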

How do I web scrape using Python and BeautifulSoup?

I'm very new to this, but I have an idea for a website and I want to give it a good go. My aim is to scrape the Asda website for prices and products, more specifically, in this case, whiskey. I want to grab the name and price of all the whiskey on the Asda website and put it into a nice table on my own site. However, I am having problems doing so; my code keeps getting a syntax error. Can anyone help?
The code so far is:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
res = driver.execute_script('return document.documentElement.outerHTML')
html_soup = BeautifulSoup(res, 'html.parser')
type(html_soup)
driver.quit
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
whiskey_container = html_soup.find('div', {'class': 'co-product-lazy-container'})
for whiskey in whiskey_container:
    name = whiskey.find('a', {'class': 'co-product__anchor'})
    price = whiskey.find('div', {'class': 'co-product__price'})
    print(name, price)
Try this:
# WebDriverWait is a better way to wait than time.sleep()
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import time  # or WebDriverWait
import csv  # for saving the data in a table

# save csv file
def save_csv(dct):
    '''
    dct - dictionary with our data:
        "cap",
        "title",
        "price"
    '''
    name = "file.csv"  # file name; choose whatever you want
    print("[INFO] saving...")  # so we can see that the function runs
    with open(name, 'a', encoding="utf-8") as f:  # open file for appending ("a")
        # needed for writing data to the table
        writer = csv.writer(f)
        writer.writerow((dct['cap'],
                         dct['title'],
                         dct['price'],
                         ))

def scroll(driver):
    # scroll down so all the data we are interested in gets loaded
    for i in range(1, 6):
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(7)

driver = webdriver.Firefox()
driver.get("https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650?facets=shelf%3A1579926650%3A0000&nutrition=&sortBy=&page=0")

for i in range(2):  # 2 because there are only two pages of data
    element = WebDriverWait(driver, 30)  # or time.sleep(30)
    scroll(driver)  # load all the data we are interested in
    # collect all the product elements into one list
    data = driver.find_elements_by_css_selector(".co-lazy-product-container .co-item")
    # iterate over the products and build a dictionary for each one
    for d in data:
        items = {}
        body = d.text.split("\n")
        items["cap"] = body[0]
        items["title"] = body[1]
        items["price"] = body[-2]
        save_csv(items)
    # pagination
    driver.find_element_by_css_selector(".co-pagination__last-page").click()

# close driver
driver.quit()
You have a syntax error: a ")" is missing here:
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
It should be:
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
--
By the way, even with that fixed, your code won't work; it has a couple of logical errors (for example, driver.quit without parentheses never actually calls quit(), and the response from requests is never used), and I doubt you can scrape that page with your current code, since the product list is loaded dynamically.
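For what it's worth, here is a minimal sketch of what the parsing step might look like once the page source has been captured; the class names are taken from the question and the answer above and may have changed since:

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
html_soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()  # note the parentheses: quit() is a method call

container = html_soup.find('div', {'class': 'co-product-lazy-container'})
if container is not None:
    # find_all returns the product blocks; iterating the container directly
    # would also yield bare text nodes
    for whiskey in container.find_all('div', {'class': 'co-item'}):
        name = whiskey.find('a', {'class': 'co-product__anchor'})
        price = whiskey.find('div', {'class': 'co-product__price'})
        if name and price:
            print(name.get_text(strip=True), price.get_text(strip=True))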

How do I output a single value from a dataframe to a CSV?

I am trying to get a single value from a dataframe into a CSV file. I am trying to get information on MLB statistics and used the attached function to extract the data from MLB.com after doing some scraping. However, I only want the first row (and ideally just the team name) to be extracted to a separate CSV file. From the output below, I only want to return "Toronto Blue Jays".
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import random
import time

driver = webdriver.Chrome(executable_path='filepath/chromedriver')
driver.get('http://www.mlb.com')
stats_header_bar = driver.find_element_by_class_name('megamenu-navbar-overflow__menu-item--stats')
stats_header_bar.click()
stats_line_items = stats_header_bar.find_elements_by_tag_name('li')
stats_line_items[2].click()
hitting_season_element = driver.find_element_by_id('st_hitting_season')
season_select = Select(hitting_season_element)
season_select.select_by_value('2015')
wait = WebDriverWait(driver, 10)
team_hr_stats = wait.until(EC.visibility_of_element_located((By.ID, 'datagrid')))
print('The HR dropdown in the header was loaded successfully. The mouse will move over the element after a short delay')
normal_delay = random.normalvariate(2, 0.5)
print('Sleeping for {} seconds'.format(normal_delay))
time.sleep(normal_delay)
print('Now moving mouse...')
ActionChains(driver).move_to_element(team_hr_stats).perform()
team_hr_total = team_hr_stats.find_elements_by_tag_name('th')
team_hr_total[10].click()
data_div_1 = driver.find_element_by_id('datagrid')
data_html_1 = data_div_1.get_attribute('innerHTML')

import bs4
import requests

soup_1 = bs4.BeautifulSoup(data_html_1, 'html5lib')

def extract_stats_data(data_element):
    data_html = data_element.get_attribute('innerHTML')
    soup = bs4.BeautifulSoup(data_html, 'html5lib')
    column_names = [t.text.replace('▼', ' ').replace('▲', ' ').strip()
                    for t in soup.thead.tr.findAll('th')]
    row_lists = []
    for row in soup.tbody.findAll('tr'):
        row_lists.append([col.text for col in row.findAll('td')])
    df = pd.DataFrame(row_lists, columns=column_names)
    numeric_fields = ['HR']
    for field in numeric_fields:
        df[field] = pd.to_numeric(df[field])
    return df

df = extract_stats_data(data_div_1)
df.to_csv('Filename.csv')
The screenshot you have provided shows that the data is sorted by greatest HR first. So you first need to sort your df the same way:
df = extract_stats_data(data_div_1)
team_name = df.sort_values('HR', ascending=False).iloc[0]['Team']  # row with highest HR

# Write team name string to file
with open('Filename.csv', 'w') as f_output:
    f_output.write(team_name)
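Alternatively, to stay inside pandas, you can write a one-row, one-column slice straight to CSV; a sketch, assuming the Team column exists as in the question:

# keep only the Team value from the row with the highest HR
top_team = df.sort_values('HR', ascending=False).head(1)[['Team']]
top_team.to_csv('Filename.csv', index=False, header=False)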

Using Selenium to scrape a table across multiple pages when the url doesn't change

I have been trying to write a program to scrape the statistics from www.whoscored.com and create a pandas dataframe.
I have updated the code with the help of crookedleaf and this is the working code:
import time
import pandas as pd
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')

summary_stats = DataFrame()
while True:
    while driver.find_element_by_xpath('//*[@id="statistics-table-summary"]').get_attribute('class') == 'is-updating':  # or: driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;'
        time.sleep(1)
    table = driver.find_element_by_xpath('//*[@id="statistics-table-summary"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="currentPage"]').get_attribute('value')
    print('Page ' + page_number)
    df1 = read_html(table_html)[0]
    summary_stats = pd.concat([summary_stats, df1])
    next_link = driver.find_element_by_xpath('//*[@id="next"]')
    if 'disabled' in next_link.get_attribute('class'):
        break
    next_link.click()
print(summary_stats)
driver.close()
Now I am trying to gather the stats from the other tabs. I am really close, but the code does not exit the loop when it should break out of it. Here is the code:
defensive_button = driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]/li[2]/a')
defensive_button.click()

defensive_stats = DataFrame()
while True:
    while driver.find_element_by_xpath('//*[@id="statistics-table-defensive"]').get_attribute('class') == 'is-updating':  # or: driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;'
        time.sleep(1)
    table = driver.find_element_by_xpath('//*[@id="statistics-table-defensive"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="statistics-paging-defensive"]/div/input[1]').get_attribute('value')
    print('Page ' + page_number)
    df2 = read_html(table_html)[0]
    defensive_stats = pd.concat([defensive_stats, df2])
    next_link = driver.find_element_by_xpath('//*[@id="statistics-paging-defensive"]/div/dl[2]/dd[3]')
    if 'disabled' in next_link.get_attribute('class'):
        break
    next_link.click()
print(defensive_stats)
This code loops through all the pages, but then keeps scraping the last page over and over.
You are defining your table outside of your loop. You are navigating to the next page, but not redefining your table and table_html elements. Move them to the first lines after while True:.
EDIT: After making the changes to your code, my guess is that, due to the dynamically loaded content of the table, it is unable to process the changes, or unable to get the content because of the "loading" graphic overlay. Another thing: there may not always be 30 pages. Today, for example, there are 29, so it continuously gets the data from page 29. I modified your code to keep running until the "next" button is no longer enabled, and I put in a wait that checks whether the table is loading before continuing:
import time
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver

driver = webdriver.Chrome(path-to-your-chromedriver)
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')

df = DataFrame()
while True:
    while driver.find_element_by_xpath('//*[@id="statistics-table-summary"]').get_attribute('class') == 'is-updating':  # or: driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;'
        time.sleep(1)
    table = driver.find_element_by_xpath('//*[@id="statistics-table-summary"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="currentPage"]').get_attribute('value')
    print('Page ' + page_number)
    df1 = read_html(table_html)[0]
    df.append(df1)
    next_link = driver.find_element_by_xpath('//*[@id="next"]')
    if 'disabled' in next_link.get_attribute('class'):
        break
    next_link.click()
print(df)
driver.close()
However, I am getting an empty DataFrame at the end of running this. I'm unfortunately not familiar enough with pandas to identify the issue, but it is related to df.append(). I ran this with it printing the value of df1 at each loop, and it prints the correct data; however, it does not add it to the DataFrame. This may be something you are familiar enough with to implement the changes needed to run it completely.
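For what it's worth, the likely culprit is that DataFrame.append(), like pd.concat(), returns a new DataFrame instead of modifying the original in place, so the result has to be assigned back; the same applies to the unassigned pd.concat(...) call in the second edit below:

# append/concat return a new object; capture the result
df = df.append(df1)        # works, but DataFrame.append is deprecated in newer pandas
df = pd.concat([df, df1])  # the preferred equivalent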
EDIT 2: This one took me a while to figure out. Essentially, the page's content is being dynamically loaded with JavaScript. The 'next' element you are declaring is still the first 'next' button you come across; each time you click a new tab, the number of 'next' elements increases. I have added an edit that successfully navigates across all tabs (except the 'detailed' tab... hopefully you don't need that one, lol). I, however, am still getting empty DataFrame()s:
import time
import pandas as pd
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver

driver = webdriver.Chrome('/home/mdrouin/Downloads/chromedriver')
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')

statistics = {  # one DataFrame per tab on the page
    'summary': DataFrame(),
    'defensive': DataFrame(),
    'offensive': DataFrame(),
    'passing': DataFrame()
}

count = 0
tabs = driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]').find_elements_by_tag_name('li')  # this pulls all the tab elements
for tab in tabs[:-1]:  # iterate over the different tab sections
    section = tab.text.lower()
    driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]').find_element_by_link_text(section.title()).click()  # clicks the actual tab; .title() uppercases the first character so it matches the link text
    time.sleep(3)
    while True:
        while driver.find_element_by_xpath('//*[@id="statistics-table-%s"]' % section).get_attribute('class') == 'is-updating':  # string formatting on the xpath for each section being iterated over
            time.sleep(1)
        table = driver.find_element_by_xpath('//*[@id="statistics-table-%s"]' % section)  # string formatting on the xpath for each section being iterated over
        table_html = table.get_attribute('innerHTML')
        df = read_html(table_html)[0]
        # print(df)
        pd.concat([statistics[section], df])
        next_link = driver.find_elements_by_xpath('//*[@id="next"]')[count]  # makes sure the correct index in the list of 'next' elements is selected
        if 'disabled' in next_link.get_attribute('class'):
            break
        time.sleep(5)
        next_link.click()
    count += 1

for df in statistics.values():  # iterate over the DataFrame() elements
    print(df)
driver.quit()

Python Web Scraping: Clicking links one by one on Ajax Site

I have a list of search criteria saved in a csv file. I'd like to loop through each search criteria to generate the corresponding search results on a website. For each set of search results generated (which are links), I'd like to click into the link and then grab the data from the new page generated. Unfortunately, I am experiencing problems going into each link. If anyone could please kindly provide some insight, it would be much appreciated.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

# read list of CAS Numbers to be searched
data = pd.read_csv("NPRI CACs.csv", names=["CAS Number", "Chemical Name"])
data.dropna()
CAS = data["CAS Number"]

# Parameters to be called
url = 'http://www.lifelabs.msdss.com/Login.aspx?ReturnUrl=%2fMainMenu.aspx%3ffm%3d0%26tb%3d0'

# Sign into SafeTec
browser = webdriver.Firefox()
browser.get(url)
browser.find_element_by_class_name("text").click()

# Conduct MSDS Searches on SafeTec
for i in range(10):
    try:
        Ingredient_CAS_Number = browser.find_element_by_id("placeBody_dynField48_txtTextBox")
        Ingredient_CAS_Number.send_keys(CAS[i])
        browser.find_element_by_id("placeBody_linkSearchBottom").click()
        list_links = browser.find_elements_by_css_selector("a[href*='MSDSDetail']")
        links = []
        for j in range(len(list_links)):
            links.append(list_links[j].get_attribute('href'))
        Product_Name = []
        for link in links:
            browser.get(link)
            product = browser.find_element_by_id("placeBody_dynField1_txtTextBox")
            Product_Name.append(product)
        print(Product_Name)
        browser.get(url)
    except:
        print(CAS[i])
        continue
I managed to solve this with the code below, although the solution is a little inelegant...
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

# read list of CAS Numbers to be searched
data = pd.read_csv("NPRI CACs.csv", names=["CAS Number", "Chemical Name"])
data.dropna()
CAS = data["CAS Number"]

# Parameters to be called
url = 'http://www.lifelabs.msdss.com/Login.aspx?ReturnUrl=%2fMainMenu.aspx%3ffm%3d0%26tb%3d0'

# Sign into SafeTec
browser = webdriver.Firefox()
browser.get(url)
browser.find_element_by_class_name("text").click()

# Conduct MSDS Searches on SafeTec
for i in range(2):
    Ingredient_CAS_Number = browser.find_element_by_id("placeBody_dynField48_txtTextBox")
    Ingredient_CAS_Number.send_keys(CAS[i])
    browser.find_element_by_id("placeBody_linkSearchBottom").click()
    list_links = browser.find_elements_by_css_selector("a[href*='MSDSDetail']")
    all_results = []
    for j in list_links:
        result = j.text
        all_results.append(result)
    for i in range(len(all_results)):
        browser.find_element_by_link_text(all_results[i]).click()
        browser.back()
    browser.get(url)
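For reference, the original attempt was close: the main issue is that it appends the WebElement object itself rather than its text. A sketch of that inner loop with the fix (same element ID as in the question; depending on how the field renders, the value may live in .text or in the value attribute):

Product_Name = []
for link in links:
    browser.get(link)
    product = browser.find_element_by_id("placeBody_dynField1_txtTextBox")
    # append the text, not the WebElement; fall back to the value attribute for input fields
    Product_Name.append(product.text or product.get_attribute('value'))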
