I have a list of search criteria saved in a CSV file. I'd like to loop through each search criterion to generate the corresponding search results on a website. Each set of search results is a list of links; for each link, I'd like to click into it and grab the data from the page that opens. Unfortunately, I am having problems getting into each link. If anyone could kindly provide some insight, it would be much appreciated.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
# read list of CAS Numbers to be searched
data = pd.read_csv("NPRI CACs.csv", names=["CAS Number", "Chemical Name"])
data.dropna()
CAS = data["CAS Number"]
# Parameters to be called
url = 'http://www.lifelabs.msdss.com/Login.aspx?ReturnUrl=%2fMainMenu.aspx%3ffm%3d0%26tb%3d0'
# Sign into SafeTec
browser = webdriver.Firefox()
browser.get(url)
browser.find_element_by_class_name("text").click()
# Conduct MSDS Searches on SafeTec
for i in range(10):
    try:
        Ingredient_CAS_Number = browser.find_element_by_id("placeBody_dynField48_txtTextBox")
        Ingredient_CAS_Number.send_keys(CAS[i])
        browser.find_element_by_id("placeBody_linkSearchBottom").click()
        list_links = browser.find_elements_by_css_selector("a[href*='MSDSDetail']")
        links = []
        for j in range(len(list_links)):
            links.append(list_links[j].get_attribute('href'))
        Product_Name = []
        for link in links:
            browser.get(link)
            product = browser.find_element_by_id("placeBody_dynField1_txtTextBox")
            Product_Name.append(product)
        print(Product_Name)
        browser.get(url)
    except:
        print(CAS[i])
        continue
I managed to solve this with the code below, although the solution is a little inelegant...
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
# read list of CAS Numbers to be searched
data = pd.read_csv("NPRI CACs.csv", names=["CAS Number", "Chemical Name"])
data.dropna()
CAS = data["CAS Number"]
# Parameters to be called
url = 'http://www.lifelabs.msdss.com/Login.aspx?ReturnUrl=%2fMainMenu.aspx%3ffm%3d0%26tb%3d0'
# Sign into SafeTec
browser = webdriver.Firefox()
browser.get(url)
browser.find_element_by_class_name("text").click()
# Conduct MSDS Searches on SafeTec
for i in range(2):
    Ingredient_CAS_Number = browser.find_element_by_id("placeBody_dynField48_txtTextBox")
    Ingredient_CAS_Number.send_keys(CAS[i])
    browser.find_element_by_id("placeBody_linkSearchBottom").click()
    list_links = browser.find_elements_by_css_selector("a[href*='MSDSDetail']")
    all_results = []
    for j in list_links:
        result = j.text
        all_results.append(result)
    for i in range(len(all_results)):
        browser.find_element_by_link_text(all_results[i]).click()
        browser.back()
    browser.get(url)
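For what it's worth, here is a slightly tidier sketch that goes back to your original href-collecting approach. The element IDs and the MSDSDetail selector are copied from your code, but whether the product field exposes its content via the value attribute or via .text is an assumption you would need to verify:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

Product_Name = []
for i in range(2):
    box = browser.find_element_by_id("placeBody_dynField48_txtTextBox")
    box.clear()
    box.send_keys(CAS[i])
    browser.find_element_by_id("placeBody_linkSearchBottom").click()
    # wait until at least one result link is present before collecting hrefs
    WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[href*='MSDSDetail']")))
    links = [a.get_attribute('href') for a in
             browser.find_elements_by_css_selector("a[href*='MSDSDetail']")]
    for link in links:
        browser.get(link)  # navigating by URL avoids stale element references
        product = browser.find_element_by_id("placeBody_dynField1_txtTextBox")
        # an input usually keeps its content in "value"; fall back to .text otherwise
        Product_Name.append(product.get_attribute('value') or product.text)
    browser.get(url)  # return to the search page for the next CAS number
print(Product_Name)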
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)
email=driver.find_element(By.CSS_SELECTOR,"input#username")
email.send_keys("timgr8#outlook.com")
password=driver.find_element(By.CSS_SELECTOR,"input#password")
password.send_keys("Cosmos1990$$$$$$$")
login=driver.find_element(By.CSS_SELECTOR,"button.btn").click()
urls=[]
product=[]
soup = BeautifulSoup(driver.page_source,"lxml")
details=soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        pass
    wev = {
        'Name': t1,
    }
    product.append(wev)
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    urls.append(href)
for url in urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    data = {
        'website': website
    }
    product.append(data)
df=pd.DataFrame(product)
df.to_csv('firm.csv')
The data is written to the CSV file as shown in the picture, with the website values pushed down into separate rows. Am I appending the data the wrong way? Why is the data moving down, and where am I going wrong?
I want the output in the format shown below, with each name and its website on the same row. Kindly suggest a solution.
You can't append wev and data separately - you need website and Name in the same dictionary for pandas to know that they belong to the same row.
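To see why the rows end up staggered, here is a tiny self-contained illustration (the names and sites are made up purely for demonstration):
import pandas as pd

# appending Name dicts and website dicts separately gives staggered rows with NaNs
rows = [{'Name': 'Firm A'}, {'Name': 'Firm B'}, {'website': 'a.com'}, {'website': 'b.com'}]
print(pd.DataFrame(rows))
#      Name website
# 0  Firm A     NaN
# 1  Firm B     NaN
# 2     NaN   a.com
# 3     NaN   b.com

# keeping both keys in one dictionary per row lines them up
rows = [{'Name': 'Firm A', 'website': 'a.com'}, {'Name': 'Firm B', 'website': 'b.com'}]
print(pd.DataFrame(rows))
#      Name website
# 0  Firm A   a.com
# 1  Firm B   b.com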
You could add the websites to a separate list, like this:
sites = []
# for url in urls:
#     driver.get...
#     soup = ....
#     try:....except:....
    data = {
        'website': website
    }
    sites.append(data)
and then zip and combine:
for pi, dictPair in enumerate(zip(product, sites)):
    product[pi].update(dictPair[1])
df = pd.DataFrame(product)
df.to_csv('firm.csv')
However, I don't think that's the best way to make sure the right Names and Websites are matched up.
You should just add to the same dictionary for each row from the start, instead of zipping and merging:
added_urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        # pass # then you'll just be using the previous row's t1
        # [also, if this happens in the first loop, it will raise an error]
        t1 = 'MISSING' # '' #
    wev = {
        'Name': t1,
    }
    href = detail.select_one("h5.profile-title + p a[href]")
    if href and href.get("href", '').startswith('http'):
        wev['page_link'] = href.get("href")
        added_urls.append(href.get("href"))
    product.append(wev)
### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    if href in added_urls: continue # skip links that are already added
    # urls.append(href)
    added_urls.append(href)
    product.append({"page_link": href})
##########################################################
for pi, prod in enumerate(product):
    if "page_link" not in prod or not prod["page_link"]: continue ## missing link
    url = prod["page_link"]
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    del product[pi]["page_link"] ## REMOVE this line IF you want a page_link column in csv
    # data={'website':website}
    # product.append(data)
    product[pi]['website'] = website

df = pd.DataFrame(product)
df.to_csv('firm.csv')
I tried to scrape all the 'a' tag values from this sample link: https://www.fundoodata.com/companies-in/list-of-apparel-stores-companies-in-india-i239
What I need is to copy only the text of each 'a' tag and save it to a CSV file.
I'm a beginner; can anyone help me out? Below is my (broken) code:
# importing the modules
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import os
link = "https://www.fundoodata.com/companies-in/list-of-apparel-stores-companies-in-india-i239"
# instantiating empty lists
nameList = []
for i in range(1):
    driver = webdriver.Chrome(ChromeDriverManager().install())
    # fetching all the store details
    storeDetails = driver.find_elements_by_class_name('search-result')
    # iterating the storeDetails
    for j in range(len(storeDetails)):
        # fetching the name, address and contact for each entry
        name = storeDetails[j].find_element_by_class_name('heading').text
        myList = []
        nameList.append(name)
    driver.close()
# initialize data of lists.
data = {'Company Name': nameList,}
# Create DataFrame
df = pd.DataFrame(data)
print(df)
# Save Data as .csv
df.to_csv("D:\xxx\xxx\xx\xxx\demo.csv", mode='w+', header = False)
There are lots of problems here, the first of which is that you never open the webpage:
## to open webpage
driver.get(link)
Secondly, you don't need the outer for loop at all. Lastly:
from selenium.webdriver.common.by import By
## find the a tags inside the search results
storedetails = driver.find_elements(By.CSS_SELECTOR, 'div.heading a')
## iterating over elements list
for name in storedetails:
    ## appending the a tag text
    nameList.append(name.text)
I hope this helped! :)
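Putting those pieces together, a minimal end-to-end sketch might look like this (the div.heading a selector is the one used above; the output file name demo.csv is just a placeholder):
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

link = "https://www.fundoodata.com/companies-in/list-of-apparel-stores-companies-in-india-i239"

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(link)  # open the page before looking for elements

# collect the text of every company-name link
nameList = [a.text for a in driver.find_elements(By.CSS_SELECTOR, 'div.heading a')]
driver.quit()

# save to CSV
pd.DataFrame({'Company Name': nameList}).to_csv("demo.csv", index=False)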
I'm very new to this, but I have an idea for a website and I want to give it a good go. My aim is to scrape the Asda website for prices and products, more specifically, in this case, whiskey. I want to grab the name and price of all the whiskey on the Asda website and put it into a nice table on my own site. However, I am having problems doing so: my code so far is getting a syntax error. Can anyone help?
The code so far is:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
res = driver.execute_script('return document.documentElement.outerHTML')
html_soup = BeautifulSoup(res, 'html.parser')
type(html_soup)
driver.quit
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
whiskey_container = html_soup.find('div', {'class': 'co-product-lazy-container'})
for whiskey in whiskey_container:
    name = whiskey.find('a', {'class': 'co-product__anchor'})
    price = whiskey.find('div', {'class': 'co-product__price'})
    print(name, price)
Try this:
# for wait time better than time.sleep()
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import time # or WebDriverWait
import csv # for saving data in table
# save csv file
def save_csv(dct):
    '''
    dct - dictionary with our data:
        "cap",
        "title",
        "price"
    '''
    name = "file.csv"  # file name, choose whatever you want
    print("[INFO] saving...")  # just to see that the function is running
    with open(name, 'a', encoding="utf-8") as f:  # open the file in append mode ("a")
        # this is needed for writing the data as table rows
        writer = csv.writer(f)
        writer.writerow((dct['cap'],
                         dct['title'],
                         dct['price'],
                         ))
def scroll(driver):
    # scroll so that all the data we are interested in gets loaded
    for i in range(1, 6):
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(7)
driver = webdriver.Firefox()
driver.get("https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650?facets=shelf%3A1579926650%3A0000&nutrition=&sortBy=&page=0")
for i in range(2):  # 2 because there are only two pages of data
    element = WebDriverWait(driver, 30)  # or time.sleep(30)
    scroll(driver)  # load all the data we are interested in
    # get all product containers into one list
    data = driver.find_elements_by_css_selector(".co-lazy-product-container .co-item")
    # iterate over the products and create a dictionary for each one
    for d in data:
        items = {}
        body = d.text.split("\n")
        items["cap"] = body[0]
        items["title"] = body[1]
        items["price"] = body[-2]
        save_csv(items)
    # pagination
    driver.find_element_by_css_selector(".co-pagination__last-page").click()

# close driver
driver.quit()
You have a syntax error: a closing ")" is missing here:
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650'
It should be:
response = requests.get('https://groceries.asda.com/shelf/drinks/spirits-ready-to-drink/spirits/whisky/1579926650')
--
By the way, even with that fix your code won't work; you have a couple of logical errors, and I doubt you can scrape that page with your current code.
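If you do stay with Selenium plus BeautifulSoup, the parsing part of your original attempt could look roughly like this (the .co-lazy-product-container and .co-item class names are taken from the answer above and may have changed on the live site):
html_soup = BeautifulSoup(res, 'html.parser')
for whiskey in html_soup.select('.co-lazy-product-container .co-item'):
    name = whiskey.select_one('a.co-product__anchor')
    price = whiskey.select_one('.co-product__price')
    if name and price:
        print(name.get_text(strip=True), price.get_text(strip=True))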
I am scraping pages one by one, but the problem I ran into today is that when the next page doesn't exist, the site just returns the previous page without any error from which I could determine that it was the last one.
For example: https://example/page-7
When I try to go to https://example/page-8, which doesn't exist, it gives me
the last page: https://example/page-7
How can I determine that https://example/page-7 was the last page using Python 3?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request
page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-1"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
for j in range(100):
    soup = BeautifulSoup(dd, "html.parser")
    my_text = list(soup.findAll("div", class_ = "post-content"))
    for i in my_text:
        # collect some data
        pass
    page = "https://www.supermama.lt/forumas/topic/214375-vilma/page{0}".format(j+2)
    driver.get(page)
    dd = driver.page_source
At first I was thinking about checking the collected data for duplicates, but this is too slow because I have 30,000 links to collect data from. Maybe there is an easier solution?
Found the answer to my own question.
To find the current page URL, just use driver.current_url:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request
page = "https://www.supermama.lt/forumas/topic/214375-vilma/"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
current_pages = []
for j in range(100):
    page_url = driver.current_url
    if page_url not in current_pages:
        current_pages.append(page_url)
        soup = BeautifulSoup(dd, "html.parser")
        my_text = list(soup.findAll("div", class_ = "post-content"))
        for i in my_text:
            # collect some data
            pass
        page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j+2)
        driver.get(page)
        dd = driver.page_source
    else:
        print(current_pages)
        driver.quit()
        break
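A slightly simpler variant of the same idea, just as a sketch: compare the URL you requested with driver.current_url after every driver.get, and stop as soon as they differ (this assumes each existing page keeps its page-N suffix in the final URL):
from selenium import webdriver
from bs4 import BeautifulSoup

base = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}"
driver = webdriver.Firefox()

j = 1
while True:
    page = base.format(j)
    driver.get(page)
    if driver.current_url != page:  # redirected back, so the previous page was the last one
        break
    soup = BeautifulSoup(driver.page_source, "html.parser")
    for post in soup.findAll("div", class_="post-content"):
        # collect some data
        pass
    j += 1

driver.quit()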
I wrote some code to extract insider trading information from the Toronto Stock Exchange. Using Selenium, it opens this link and then, using a list of stocks, inputs each one into the form, retrieves the data, puts it into another list, and repeats for the next stock.
Here is the code:
from selenium import webdriver
stocks = ['RKN','MG','GTE','IMO','REI.UN','RY']
dt = []
url = 'https://app.tmxmoney.com/research/insidertradesummaries?locale=EN'
driver = webdriver.Firefox()
driver.get(url)
search = driver.find_element_by_xpath('//ul[@class="nav nav-pills"]/li[3]')
search.click()
stock_form = driver.find_element_by_name('text')
for stock in stocks:
    stock_form.clear()
    stock_form.send_keys(stock)
    stock_form.submit()
    data = driver.find_element_by_xpath('//div[@class="insider-trades-symbol-search-container"]/div[@class="ng-binding"]')
    a = data.text.split('\n')
    if len(a) > 1:
        dt.append(a[-1].split())
    else:
        dt.append([])
driver.close()
If you run the code, you can see each stock being input into the form and the data popping up, which I then attempt to retrieve. However, when I get the text from data, it's as if it was retrieved from what was visible on the page prior to submitting the form. I tried adding a wait to the code, to no avail.
I added a time.sleep(1) and the code works as intended:
from selenium import webdriver
import time
stocks = ['RKN','MG','GTE','IMO','REI.UN','RY']
dt = []
url = 'https://app.tmxmoney.com/research/insidertradesummaries?locale=EN'
driver = webdriver.Firefox()
driver.get(url)
search = driver.find_element_by_xpath('//ul[@class="nav nav-pills"]/li[3]')
search.click()
stock_form = driver.find_element_by_name('text')
for stock in stocks:
    stock_form.clear()
    stock_form.send_keys(stock)
    stock_form.submit()
    data = driver.find_element_by_xpath('//div[@class="insider-trades-symbol-search-container"]/div[@class="ng-binding"]')
    a = data.text.split('\n')
    time.sleep(1)
    if len(a) > 1:
        dt.append(a[-1].split())
    else:
        dt.append([])
driver.close()
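If you would rather avoid a fixed sleep, an explicit wait is usually more robust. A rough sketch (the XPath is the one from the question; waiting for the element to be present is an assumption about how the page refreshes, so you may need a different expected condition, e.g. waiting for staleness of the old element first):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

xpath = '//div[@class="insider-trades-symbol-search-container"]/div[@class="ng-binding"]'
for stock in stocks:
    stock_form.clear()
    stock_form.send_keys(stock)
    stock_form.submit()
    # wait up to 10 seconds for the results element to be present before reading it
    data = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, xpath)))
    a = data.text.split('\n')
    dt.append(a[-1].split() if len(a) > 1 else [])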