I'm working with python, selenium. I'm typing a keyword which is then being searched on google. In the results section, I am trying to open the URLs one by one and storing the data of the p tag.
But in my script, it is storing data of only one site. Can anyone help on this to store the data of p tag of all the opened sites?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')
driver = webdriver.Chrome(executable_path="E:\chromedriver\chromedriver.exe")
for i in range(1):
matched_elements = driver.get("https://www.google.com/search?q=" +
search_input + "&start=" + str(i))
print(driver.title)
driver.maximize_window()
time.sleep(5)
links_url = driver.find_elements_by_xpath("//div[#class='yuRUbf']/a[#href]")
links = []
for x in links_url:
links.append(x.get_attribute('href'))
link_data = []
for new_url in links:
print('new url : ', new_url)
driver.get(new_url)
link_data.append(driver.page_source)
"""
Getting the data from the site
"""
content = driver.find_elements(By.TAG_NAME, "p")
for data in content:
print(data.text)
driver.back()
driver.close()
Here is the edited answer, first I misunderstood your question:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')
driver = webdriver.Chrome(executable_path="E:\chromedriver\chromedriver.exe")
for i in range(1):
matched_elements = driver.get("https://www.google.com/search?q=" +
search_input + "&start=" + str(i))
print(driver.title)
driver.maximize_window()
time.sleep(5)
links_url = driver.find_elements_by_xpath("//div[#class='yuRUbf']/a[#href]")
links = []
for x in links_url:
links.append(x.get_attribute('href'))
link_data = []
for new_url in links:
print('\nnew url : ', new_url)
driver.get(new_url)
#Getting the data from the site
try:
link = driver.find_elements(By.TAG_NAME, "p")
for p in link:
print(p.get_attribute("innerText"))
except:
continue
driver.quit()
Related
I use the following code to insert a code in a search bar, click a button and finally extract some information:
from selenium import webdriver
import time
from fake_useragent import UserAgent
url = 'https://www.ufficiocamerale.it/'
vat = '06655971007'
useragent = UserAgent()
profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", useragent.random)
driver = webdriver.Firefox(profile)
driver.get(url)
time.sleep(5)
item = driver.find_element_by_xpath('//form[#id="formRicercaAzienda"]//input[#id="search_input"]')
item.send_keys(vat)
time.sleep(1)
button = driver.find_element_by_xpath('//form[#id="formRicercaAzienda"]//p//button[#type="submit"]')
button.click()
time.sleep(5)
all_items = driver.find_elements_by_xpath('//ul[#id="first-group"]/li')
for item in all_items:
if '#' in item.text:
print(item.text.split(' ')[1])
driver.close()
Now I would like to modify the code to handle the above process several times thanks to a for loop, i.e. something like this:
from selenium import webdriver
import time
from fake_useragent import UserAgent
url = 'https://www.ufficiocamerale.it/'
vats = ['06655971007', '06655971007', '01010101010']
for vat in vats:
useragent = UserAgent()
# rest of the code
but it does nothing. Where am I doing wrong? Is it the definition of user agent?
Can you be more specific about "it does nothing" please ?
Is code without loop is working fine ?
*when testing it on this web site, for "06655971007" as input, it won't write anything because no # in returned string
EDIT
from selenium import webdriver
import time
#from fake_useragent import UserAgent
url = 'https://www.ufficiocamerale.it/'
vats = ['06655971007', '06655971007', '01010101010']
for vat in vats:
#useragent = UserAgent()
profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", "useragent.random")
driver = webdriver.Chrome('./chromedriver.exe')
#driver = webdriver.Firefox(profile)
driver.get(url)
time.sleep(5)
item = driver.find_element_by_xpath('//form[#id="formRicercaAzienda"]//input[#id="search_input"]')
item.send_keys(vat)
time.sleep(1)
button = driver.find_element_by_xpath('//form[#id="formRicercaAzienda"]//p//button[#type="submit"]')
button.click()
time.sleep(5)
all_items = driver.find_elements_by_xpath('//ul[#id="first-group"]/li')
found_it = False
for item in all_items:
if '#' in item.text:
print(vat + " = " + item.text.split(' ')[1])
found_it = True
if not found_it:
print(vat + " no email found")
driver.close()
With output like this :
01010101010 no email found
08157270961 = vince.srl#legalmail.it
06655971007 = enelenergia#pec.enel.it
I am trying to scrape products details from aliexpress. I have 2 questions. First, how do I scrape category and save it in csv file in front of each product and second, how do I move to the 2nd and other pages until there are no more pages available or until page 10.
This is the code I have written to find the next pages
from bs4 import BeautifulSoup
import requests as r
page = r.get('https://www.aliexpress.com/category/200000664/jackets.html?spm=2114.11010108.102.4.650c649b8lfPOb')
soup = BeautifulSoup(page.content,'html.parser')
content = soup.find(id="pagination-bottom")
pages = content.findAll('a')
for i in pages:
print('https:' + i.get('href'))
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup import urllib.request
filename = "alibaba.csv"
f=open(filename, "w")
headers="product_name, price, Rating \n"
f.write(headers)
class alibabascrape(object):
def __init__(self, keyword):
self.keyword = keyword
self.url = f"https://www.aliexpress.com/wholesale?catId=0&initiative_id=&SearchText={keyword}"
self.driver = webdriver.Firefox(executable_path = 'c:\geckodriver.exe')
self.delay = 3
def load_alibabalist_url(self):
self.driver.get(self.url)
try:
wait = WebDriverWait(self.driver, self.delay)
wait.until(EC.presence_of_all_elements_located((By.ID, "form-searchbar")))
print("page is ready")
except TimeoutException:
print("Too much Time")
def extract_post_information(self):
all_posts = self.driver.find_elements_by_class_name("list-item")
post_title_list = []
for post in all_posts:
title=post.text.split("\n")
name=title[0]
print(name)
price=title[2]
print(price)
rating = title[6]
print(rating)
f.write(name + "," + price + "," + rating + "\n")
post_title_list.append(post.text)
return post_title_list
def extract_category(self):
category = self.driver.find_elements_by_class_name("col-sub")
print(category)
def extract_post_urls(self):
url_list = []
html_page = urllib.request.urlopen(self.url)
soup = BeautifulSoup(html_page, "lxml")
for link in soup.findAll("a", {"class": "history-item product"}):
print(link["href"])
url_list.append(link["href"])
return url_list
keyword = "iphone"
scrapper = alibabascrape(keyword)
scrapper.load_alibabalist_url()
scrapper.extract_post_information()
scrapper.extract_category()
scrapper.extract_post_urls()
I can help you with pagination:
If you get all ref links then you can simply use for loop to iterate
all links.
If you just have prev or next page link. Then use while/ do while loop to check if the link exists and then click on it.
scraping contact information from the directory site
I am scraping contact information from the directory site.
this is not a link
I need scrape by selenium. it needs 3 steps,
1. get the company url from website.
2. get all company url from next page/ all pages.
3. scrape all contact information such as company name, website, email. etc.
the code as below, but I face two problem.
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
results = list()
driver = webdriver.Chrome('D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2
for i in range(1, MAX_PAGE_NUM):
page_num = str(i)
url ="http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + page_num
driver.get(url)
sleep(5)
sel = Selector(text=driver.page_source)
companies = sel.xpath('//*[#id="categorypagehtml"]/div[1]/div[7]/ul/li/b//#href').extract()
for i in range(0, len(companies)):
print(companies[i])
results.append(companies[i])
print('---')
for result in results:
url1 = "http://www.arabianbusinesscommunity.com" +result
print(url1)
driver.get(url1)
sleep(5)
sel = Selector(text=driver.page_source)
name = sel.css('h2::text').extract_first()
country = sel.xpath('//*[#id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
if country:
country = country.strip()
web = sel.xpath('//*[#id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/#href').extract_first()
email = sel.xpath('//a[contains(#href, "mailto:")]/#href').extract_first()
records = []
records.append((web,email,country,name))
df = pd.DataFrame(records, columns=['web','email', 'country', 'name'])
I write the code as above, but I have two problem.
1. I only can get the last company information.
2.each time it is iteration from the loop, computer always click all urls that clicked before.
can anyone help solve the problem?
Here code to get all companies details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))
# Get last page number
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])
# Get all URLs for the first page and save them in companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)
# Iterate through all pages and get all companies URLs
for i in range(2, lastPageNum):
driver.get(baseUrl + "/" + str(i))
companyUrls.extend(driver.execute_script(js))
# Open each company page and get all details
companies = []
for url in companyUrls:
driver.get(url)
company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
name = company.find_element_by_css_selector("h2").text
email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
country = company.find_element_by_xpath(".//li[#class='location']/span[last()]").text.replace(",", "").strip()
address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[#class='location']/span[position() != last()]")])
I'm making a Craigslist scraper to scrape the titles, prices, date, and URL and exported that info to a CSV. Now, I want Selenium to click on the post URL to navigate to the actual page, parse the page to get a span tag "Odometer" (to get mileage), and return that to my CSV file.
Here's my code so far:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#import schedule
from bs4 import BeautifulSoup
import urllib.request
import csv
import pandas as pd
class CraigslistScaper(object):
def __init__(self,query,location,max_price,transmission):
self.query = query
# self.sort=sort
self.location = location
# self.postal = postal
self.max_price = max_price
self.transmission = auto_transmission
#https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1
self.url = "https://{}.craigslist.org/search/cta?query={}&sort=rel&max_price={}&auto_transmission={}".format(self.location, self.query, self.max_price, self.transmission)
self.driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
self.delay = 5
def load_craigslist_url(self):
self.driver.get(self.url)
try:
wait = WebDriverWait(self.driver, self.delay)
wait.until(EC.presence_of_element_located((By.ID,"searchform")))
print("page is ready")
except TimeoutError:
print('Loading took too much time')
#extracting the post information such as titles, dates, and prices
def extract_post_information(self):
all_posts = self.driver.find_elements_by_class_name('result-row')
titles = []
dates = []
prices = []
post_info_list = []
for i in range(len(all_posts)):
post = all_posts[i]
title = post.text.split('$')
if title[0] == '':
title = title[1]
else:
title = title[0]
title = title.split("\n")
price = title[0]
title = title[-1]
title = title.split(' ')
month = title[0]
day = title[1]
date = month + " " + day
title = ' '.join(title[2:])
#print('PRICE: ' + (price))
#print('TITLE: ' + (title))
#print('DATE: ' + date)
lst = [price, title, date]
post_info_list.append(lst)
#f=open("miata_prices.csv", "a+")
#f.write(post_info_list)
#print(post_info_list)
#df = pd.DataFrame(post_info_list)
#df.to_csv('miata_prices.csv', index=False, header=False)
print(post_info_list)
return post_info_list
def save_post_info_and_urls_to_csv(self, post_info, post_urls):
for i in range(len(post_info)):
post_info[i].append(post_urls[i])
#print(post_info)
df = pd.DataFrame(post_info)
df.to_csv('miata_prices.csv', index=False, header=False)
return post_info
#extracting post URLs
def extract_post_urls(self):
url_list = []
soup = BeautifulSoup(self.driver.page_source,'html.parser')
aTagsInLi = self.driver.find_elements_by_css_selector('li a')
self.driver.find_elements_by_css_selector('li a')[0].click()
for a in aTagsInLi:
link = a.get_attribute('href')
print(link)
link = self.driver.find_element_by_link_text('Miata')
print(link)
link.click()
for link in soup.findAll('a', {'class': "result-title hdrlnk"}):
#print(link.get('href'))
url_list.append(link.get('href'))
return url_list
#to click on URL Links and parse the HTML
def click_next_page(self):
href = driver.find_element_by_partial_link_text("result-title hdrlink")
extract_post_urls(url_list).click(href)
def quit(self):
self.driver.close()
location = "sfbay"
max_price = "5000"
#radius = "250"
auto_transmission = 1
query = "Mazda Miata"
scraper = CraigslistScaper(query,location,max_price,auto_transmission)
scraper.load_craigslist_url()
post_info = scraper.extract_post_information()
#print(post_info)
post_urls = scraper.extract_post_urls()
#print(post_urls)
scraper.save_post_info_and_urls_to_csv(post_info, post_urls)
#print(post_info)
scraper.quit()
I manage to get everything to the CSV file, but I'm stuck on how I can get Selenium to open every link in a new tab, get the odometer information, then close the tab.
I'm using this to build a dataset and eventually do some analysis with it!
I have an example how to get Selenium to open every link and get the odometer information. I used a wrapper for Selenium (SeElements) for less code. I hope you will found out how it works. So:
I'm opening your link, scrapping all links from the titles to the list. Then open every link and trying to get odometer info.
from elementium.drivers.se import SeElements
from selenium import webdriver
browser = webdriver.Chrome()
url = 'https://sfbay.craigslist.org/search/ctaquery=mazda+miata&sort=rel&max_price=6000&auto_transmission=1'
browser.get(url)
se = SeElements(browser)
titles = se.xpath('//p[#class="result-info"]/a', wait=True, ttl=5)
try:
links = []
for link in titles:
links.append(link.attribute('href'))
for link in links:
print(link)
browser.get(link)
try:
odometer = se.xpath('//span[contains(text(), "odometer")]',wait=True, ttl=2).text()
except Exception:
continue
print(odometer)
except Exception as e:
browser.quit()
raise e
I have code as below to scrape site and it is no problems, then I want to only use Selenium so I change code to this, then I got errors, I don't know why, does anyone help me?
webdriver.PhantomJS() Errors
Exception: Message: {"errorMessage":"Element does not exist in cache"
webdriver.Chrome() Error:
Exception: Message: stale element reference: element is not attached to the page document
Selenium only code
driver = webdriver.Chrome() # or webdriver.PhantomJS()
a = driver.find_elements_by_css_selector(findTag + "." + findValue + " a")
img = driver.find_elements_by_css_selector(findTag + "#" + findValue + "img")
href = a.get_attribute('href')
src = img.get_attribute("src")
Selenium + BeautifulSoup code:
driver = webdriver.Chrome() # or webdriver.PhantomJS()
soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
a = soup.find(findTag, class_=findValue).find_all("a")
img = soup.find(findTag, id=findValue).find_all("img")
href = a.get("href")
src = img.get("src")
Have you tried to implement waits? It would go as follow:
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome() # or webdriver.PhantomJS()
# Here check that your image is in the page's document.
wait = driver.WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.ID, "YourImgId")))
# Now try to find it in the DOM
img = driver.find_elements_by_css_selector(findTag + "#" + findValue + "img")
a = driver.find_elements_by_css_selector(findTag + "." + findValue + " a")
href = a.get_attribute('href')
src = img.get_attribute("src")
Hope this helps :)
About waits: http://selenium-python.readthedocs.io/waits.html
Edit: not a wait issue
Just navigate to the page with selenium, enter your credential and then use beautifulsoup to scrape the page. It should then be fine :)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
ex_path = r"C:\chromedriver_win32\chromedriver.exe"
# Going to the link
driver = webdriver.Chrome(executable_path = ex_path)
driver.get("http://ipcamera-viewer.com/view/?camera_code=199619")
# Enter the password
code = driver.find_element_by_name("pass")
code.send_keys("5042")
code.send_keys(Keys.ENTER)
# Now get the soup
soup = BeautifulSoup(driver.page_source, "html.parser")
element_ = soup.find("ul", id = "grid")
images_links = []
for img in element_.find_all("img"):
images_links.append(img.get("src"))
print images_links[0:2]
Output:
>>> [u'http://ipcamera-viewer.com/image/?p=199619_20170301_201334_5668.jpg', u'http://ipcamera-viewer.com/image/?p=199619_20170301_201329_5611.jpg']