How to scrape data on a subpage of a website? - python

Here's the website : website
And here's my script :
from selenium import webdriver
import time
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium.webdriver.common.keys import Keys

# Path to the folder where you placed your chromedriver.
# Raw string so the backslash in the Windows path is never treated as an escape.
PATH = r"driver\chromedriver.exe"

options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
# driver = webdriver.Chrome(options=options, executable_path=PATH)

url = 'https://www.booking.com/hotel/fr/d-argentine.fr.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGhNiAEBmAENuAEXyAEM2AED6AEBiAIBqAIDuAKr2vuGBsACAdICJDE1YjBlZDY1LTI2NzEtNGM3Mi04OWQ1LWE5MjQ3OWFmNzE2NtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1625222475;srpvid=48244b2523010057;type=total;ucfs=1&#tab-main'
driver = webdriver.Chrome(options=options, executable_path=PATH)
driver.get(url)
driver.maximize_window()
time.sleep(2)

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

# Accept the cookie banner if it is present.  The lookup itself can raise
# NoSuchElementException, so it belongs inside the try block too
# (XPath attribute marker restored from the mangled '#' to '@').
try:
    cookie = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
    cookie.click()
except Exception:
    pass
time.sleep(2)

# Open the reviews tab.
country = driver.find_element_by_xpath('//*[@class="hp_nav_reviews_link toggle_review track_review_link_zh"]')
country.click()
time.sleep(2)

url2 = driver.current_url

# One list per CSV column; lists stay aligned because every review appends
# exactly one entry to each list ('NA'/'NaN' placeholders when missing).
commspos = []
commsneg = []
header = []
notes = []
dates = []
datestostay = []

results = requests.get(url2, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
reviews = soup.find_all('li', class_="review_item clearfix")

for review in reviews:
    # .find() returns None for a missing node, so .text raises AttributeError.
    try:
        commpos = review.find("p", class_="review_pos").text.strip()
    except AttributeError:
        commpos = 'NA'
    commspos.append(commpos)
    try:
        commneg = review.find("p", class_="review_neg").text.strip()
    except AttributeError:
        commneg = 'NA'
    commsneg.append(commneg)
    head = review.find('div', class_='review_item_header_content').text.strip()
    header.append(head)
    note = review.find('span', class_='review-score-badge').text.strip()
    notes.append(note)
    # Slice skips the fixed label prefix inside the date text.
    date = review.find('p', class_='review_item_date').text[23:].strip()
    dates.append(date)
    try:
        datestay = review.find('p', class_='review_staydate').text[20:].strip()
        datestostay.append(datestay)
    except AttributeError:
        datestostay.append('NaN')

data = pd.DataFrame({
    'commspos': commspos,
    'commsneg': commsneg,
    'headers': header,
    'notes': notes,
    'dates': dates,
    'datestostay': datestostay,
})
data.to_csv('dftest.csv', sep=';', index=False, encoding='utf_8_sig')
My script goes here :
But my output CSV file is empty. I assume it has to do with some kind of JavaScript. I have encountered this before: the script navigates to the subpage but is not really inside it, so it does not have access to that HTML and produces nothing in the output.

That sub-page that you want loads the data from this URL.
https://www.booking.com/hotelfeaturedreviews/fr/d-argentine.fr.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGhNiAEBmAENuAEXyAEM2AED6AEBiAIBqAIDuAKr2vuGBsACAdICJDE1YjBlZDY1LTI2NzEtNGM3Mi04OWQ1LWE5MjQ3OWFmNzE2NtgCBOACAQ;sid=22417257c7da25395d270bcc7c6ec2e8;dest_id=-1456928;dest_type=city;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1625222475;srpvid=48244b2523010057;type=total;ucfs=1&&_=1625476042625
You can easily scrape this page and extract the data you need.

Related

Append data wrong in csv file

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)

# Log in ('@' restored in the email address — it was mangled to '#').
email = driver.find_element(By.CSS_SELECTOR, "input#username")
email.send_keys("timgr8@outlook.com")
password = driver.find_element(By.CSS_SELECTOR, "input#password")
password.send_keys("Cosmos1990$$$$$$$")
login = driver.find_element(By.CSS_SELECTOR, "button.btn").click()

urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    # NOTE(review): if select_one fails here, t1 keeps the previous row's
    # value (or is unbound on the first row) — see the answer below.
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        pass
    wev = {
        'Name': t1,
    }
    product.append(wev)

page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    urls.append(href)

# Visit each profile page and collect the website, one dict per page.
for url in urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    data = {
        'website': website
    }
    product.append(data)

df = pd.DataFrame(product)
df.to_csv('firm.csv')
The data from the website is written to the CSV file as shown in the picture. Am I appending the data in the wrong way? Why is the data moving down, and where am I going wrong? Kindly advise.
I want output in these format Kindly suggest solution for these...I want output in these format as you shown below...
You can't append wev and data separately - you need website and name in the same dictionary for pandas to know that they belong to same row.
You could add the websites in a separate list like
# Collect the websites in a separate, parallel list.
sites = []
# for url in urls:
#     driver.get...
#     soup = ....
#     try:....except:....
data = {
    'website': website
}
sites.append(data)
and then zip and combine:
# Merge each website dict into the matching name dict, row by row.
for pi, dictPair in enumerate(zip(product, sites)):
    product[pi].update(dictPair[1])

df = pd.DataFrame(product)
df.to_csv('firm.csv')
However, I don't think it's the best way to make sure the right Names and Websites are matched up.
You should just add to the same dictionary for each row from the start instead of zipping and merging.
added_urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        # pass # then you'll just be using the previous row's t1
        # [also, if this happens in the first loop, it will raise an error]
        t1 = 'MISSING'  # '' #
    wev = {
        'Name': t1,
    }
    # Keep the profile link in the SAME dict so pandas puts it on the same row.
    href = detail.select_one("h5.profile-title + p a[href]")
    if href and href.get("href", '').startswith('http'):
        wev['page_link'] = href.get("href")
        added_urls.append(href.get("href"))
    product.append(wev)

### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    # BUGFIX: read the href BEFORE testing membership — the original checked
    # the stale `href` from the previous loop (a bs4 Tag, never in the list).
    href = link.get_attribute("href")
    if href in added_urls: continue  # skip links that are already added
    added_urls.append(href)
    product.append({"page_link": href})
##########################################################

for pi, prod in enumerate(product):
    if "page_link" not in prod or not prod["page_link"]: continue  ## missing link
    url = prod["page_link"]
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    del product[pi]["page_link"]  ## REMOVE this line IF you want a page_link column in csv
    # data={'website':website}
    # product.append(data)
    product[pi]['website'] = website

df = pd.DataFrame(product)
df.to_csv('firm.csv')

"selenium.common.exceptions.TimeoutException: Message: " Why am I getting this error message?

from selenium import webdriver
from lxml import html, etree
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selectorlib import Extractor
import os
from datetime import date
import shutil
import requests
import json

# hey
# os.system("cat banner.txt")

today = date.today()  # get todays date for the output file
# NOTE(review): this rebinds the imported name `date` to a string — works
# here because `date.today()` is never called again, but rename if reused.
date = today.strftime("%b-%d-%Y")
print('\n')
search_query = input('Enter an item: ')

chrome_options = Options()
chrome_options.add_experimental_option("detach", True)


def search_amazon(item):
    """Search Amazon for `item`, page through results, save page URLs."""
    # Raw string so backslashes in the Windows path are not escapes.
    s = Service(r'V:\Python Project\chromedriver_win32\chromedriver.exe')
    driver = webdriver.Chrome(service=s)
    # driver = webdriver.Edge(service=s)
    driver.get('https://www.amazon.com')
    search_box = driver.find_element(By.ID, "twotabsearchtextbox")
    # search_button = search_box.find_element(By.CLASS_NAME,"nav-search-submit-text").click()
    search_button = driver.find_element(By.CLASS_NAME, "nav-search-submit-text").click()
    driver.maximize_window()  # For maximizing window
    driver.implicitly_wait(50)
    driver.implicitly_wait(5)
    try:
        num_page = driver.find_element(By.CLASS_NAME, "a-pagination")
    except NoSuchElementException:
        # NOTE(review): .click() returns None, so num_page.text below would
        # fail on this path — confirm the intended fallback element.
        num_page = driver.find_element(By.CLASS_NAME, "a-last").click()
    driver.implicitly_wait(3)
    url_list = []
    for i in range(int(num_page.text)):
        page_ = i + 1
        url_list.append(driver.current_url)
        driver.implicitly_wait(4)
        click_next = driver.find_element(By.CLASS_NAME, 'a-last').click()
        print("Page " + str(page_) + " grabbed")
    driver.quit()
    with open('search_results_urls.txt', 'w') as filehandle:
        for result_page in url_list:
            filehandle.write('%s\n' % result_page)
    print("---DONE GRABBING LINKS---")


def scrape(url):
    """Download one results page and extract products via the YAML extractor."""
    headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Simple check to check if page was blocked (Usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, r.status_code))
        return None
    # Pass the HTML of the page and create
    return e.extract(r.text)


search_amazon(search_query)  # <------ search query goes here.

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('search_results.yml')
# product_data = []
output_file = open('{}_{}_results.jsonl'.format(search_query, date), "w+")
destination = 'results'

with open("search_results_urls.txt", 'r') as urllist, open('{}_{}_results.jsonl'.format(search_query, date), 'w') as outfile:
    for url in urllist.read().splitlines():
        data = scrape(url)
        if data:
            for product in data['products']:
                product['search_url'] = url
                print("Saving Product: %s" % product['title'].encode('utf8'))
                json.dump(product, outfile)
                outfile.write("\n")
                # sleep(5)

new_path = shutil.move('{}_{}_results.jsonl'.format(search_query, date), destination)
print("---DONE---")
print('\n')
To the above code I am getting the below error messages:
Getting error on line:-
items=wait(driver,30).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "s-result-item s-asin")))
which says:-
selenium.common.exceptions.TimeoutException: Message:
search_results.yml File:
# selectorlib extractor definition: one `products` list, one entry per
# search-result card, with nested child fields.
products:
    css: 'div[data-component-type="s-search-result"]'
    xpath: null
    multiple: true
    type: Text
    children:
        title:
            css: 'h2 a.a-link-normal.a-text-normal'
            xpath: null
            type: Text
        url:
            css: 'h2 a.a-link-normal.a-text-normal'
            xpath: null
            type: Link
        rating:
            css: 'div.a-row.a-size-small span:nth-of-type(1)'
            xpath: null
            type: Attribute
            attribute: aria-label
        reviews:
            css: 'div.a-row.a-size-small span:nth-of-type(2)'
            xpath: null
            type: Attribute
            attribute: aria-label
        price:
            css: 'span.a-price:nth-of-type(1) span.a-offscreen'
            xpath: null
            type: Text
search_results.txt
https://www.amazon.com/s?k=Macbook+Pro&ref=nb_sb_noss
https://www.amazon.com/s?k=Macbook+Pro&page=2&qid=1601905266&ref=sr_pg_1
https://www.amazon.com/s?k=Macbook+Pro&page=3&qid=1601905268&ref=sr_pg_2
https://www.amazon.com/s?k=Macbook+Pro&page=4&qid=1601905269&ref=sr_pg_3
https://www.amazon.com/s?k=Macbook+Pro&page=5&qid=1601905269&ref=sr_pg_4
https://www.amazon.com/s?k=Macbook+Pro&page=6&qid=1601905270&ref=sr_pg_5
The code above is for scraping the data from Amazon website using Selenium, to collect the data by entering any particular product name.
I tried most of the solutions by surfing from the internet and Youtube like adding '(By.)' for ID, and Class and so on, but nothing gave the required solution.
Please can anyone provide me with the required solution?.

Selenium doesn't return all elements required

I'm trying to get a bunch of links to the houses from this website, but it only returns about 9 elements even though the page has more. I also tried using BeautifulSoup, but the same thing happens and it does not return all the elements.
With Selenium:
# Scroll the results list container repeatedly so lazy-loaded cards render
# (XPath attribute marker restored from '#' to '@').
for i in range(10):
    time.sleep(1)
    scr1 = driver.find_element_by_xpath('//*[@id="search-page-list-container"]')
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scr1)

link_tags = driver.find_elements_by_css_selector(".list-card-info a")
links = [link.get_attribute("href") for link in link_tags]
pprint(links)
With bs4:
headers = {
    'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
    # BUGFIX: the two implicitly-concatenated parts were missing a space,
    # producing "...Gecko)Chrome/..." in the sent header.
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.131 Safari/537.36'
}
response = requests.get(ZILLOW_URL, headers=headers)
website_content = response.text
soup = BeautifulSoup(website_content, "html.parser")
link_tags = soup.select(".list-card-info a")
link_list = [link.get("href") for link in link_tags]
pprint(link_list)
Output:
'https://www.zillow.com/b/407-fairmount-ave-oakland-ca-9NTzMK/',
'https://www.zillow.com/homedetails/1940-Buchanan-St-A-San-Francisco-CA-94115/15075413_zpid/',
'https://www.zillow.com/homedetails/2380-California-St-QZ6SFATJK-San-Francisco-CA-94115/2078197750_zpid/',
'https://www.zillow.com/homedetails/5687-Miles-Ave-Oakland-CA-94618/299065263_zpid/',
'https://www.zillow.com/b/olume-san-francisco-ca-65f3Yr/',
'https://www.zillow.com/homedetails/29-Balboa-St-APT-1-San-Francisco-CA-94118/2092859824_zpid/']
Is there any way to tackle this problem? I would really appreciate the help.
You have to scroll to each element one by one in a loop and then have to look for descendant anchor tag which has the href.
driver.maximize_window()
# driver.implicitly_wait(30)
wait = WebDriverWait(driver, 50)
driver.get("https://www.zillow.com/homes/for_rent/1-_beds/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-123.7956336875%2C%22east%22%3A-121.6368202109375%2C%22south%22%3A37.02044483468766%2C%22north%22%3A38.36482775108166%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A872627%7D%2C%22beds%22%3A%7B%22min%22%3A1%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22mp%22%3A%7B%22max%22%3A3000%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22fr%22%3A%7B%22value%22%3Atrue%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A9%7D")

# Scroll each <article> card into view one by one so its lazy-loaded anchor
# renders, then read the first descendant link's href.
j = 1
for i in range(len(driver.find_elements(By.XPATH, "//article"))):
    all_items = driver.find_element_by_xpath(f"(//article)[{j}]")
    driver.execute_script("arguments[0].scrollIntoView(true);", all_items)
    print(all_items.find_element_by_xpath('.//descendant::a').get_attribute('href'))
    j = j + 1
    time.sleep(2)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Output :
https://www.zillow.com/b/bay-village-vallejo-ca-5XkKWj/
https://www.zillow.com/b/waterbend-apartments-san-francisco-ca-9NLgqG/
https://www.zillow.com/b/the-verdant-apartments-san-jose-ca-5XsGhW/
https://www.zillow.com/homedetails/1539-Lincoln-Ave-San-Rafael-CA-94901/80743209_zpid/
https://www.zillow.com/b/the-crossing-at-arroyo-trail-livermore-ca-5XjR44/
https://www.zillow.com/homedetails/713-Trancas-St-APT-4-Napa-CA-94558/2081608744_zpid/
https://www.zillow.com/b/americana-apartments-mountain-view-ca-5hGhMy/
https://www.zillow.com/b/jackson-arms-apartments-hayward-ca-5XxZLv/
https://www.zillow.com/b/elan-at-river-oaks-san-jose-ca-5XjLQF/
https://www.zillow.com/homedetails/San-Francisco-CA-94108/2078592726_zpid/
https://www.zillow.com/homedetails/20914-Cato-Ct-Castro-Valley-CA-94546/2068418792_zpid/
https://www.zillow.com/homedetails/1240-21st-Ave-3-San-Francisco-CA-94122/2068418798_zpid/
https://www.zillow.com/homedetails/1246-Walker-Ave-APT-207-Walnut-Creek-CA-94596/18413629_zpid/
https://www.zillow.com/b/the-presidio-fremont-ca-5Xk3QQ/
https://www.zillow.com/homedetails/1358-Noe-St-1-San-Francisco-CA-94131/2068418857_zpid/
https://www.zillow.com/b/the-estates-at-park-place-fremont-ca-5XjVpg/
https://www.zillow.com/homedetails/2060-Camel-Ln-Walnut-Creek-CA-94596/2093645611_zpid/
https://www.zillow.com/b/840-van-ness-san-francisco-ca-5YCwMj/
https://www.zillow.com/homedetails/285-Grand-View-Ave-APT-6-San-Francisco-CA-94114/2095256302_zpid/
https://www.zillow.com/homedetails/929-Oak-St-APT-3-San-Francisco-CA-94117/2104800238_zpid/
https://www.zillow.com/homedetails/420-N-Civic-Dr-APT-303-Walnut-Creek-CA-94596/18410162_zpid/
https://www.zillow.com/homedetails/1571-Begen-Ave-Mountain-View-CA-94040/19533010_zpid/
https://www.zillow.com/homedetails/145-Woodbury-Cir-D-Vacaville-CA-95687/2068419093_zpid/
https://www.zillow.com/b/trinity-towers-apartments-san-francisco-ca-5XjPdR/
https://www.zillow.com/b/hidden-creek-vacaville-ca-5XjV3h/
https://www.zillow.com/homedetails/19-Belle-Ave-APT-7-San-Anselmo-CA-94960/2081212106_zpid/
https://www.zillow.com/homedetails/1560-Jackson-St-APT-11-Oakland-CA-94612/2068419279_zpid/
https://www.zillow.com/homedetails/1465-Marchbanks-Dr-APT-2-Walnut-Creek-CA-94598/18382713_zpid/
https://www.zillow.com/homedetails/205-Morning-Sun-Ave-B-Mill-Valley-CA-94941/2077904048_zpid/
https://www.zillow.com/homedetails/1615-Pacific-Ave-B-Alameda-CA-94501/2073535331_zpid/
https://www.zillow.com/homedetails/409-S-5th-St-1-San-Jose-CA-95112/2078856409_zpid/
https://www.zillow.com/homedetails/5635-Anza-St-P5G3CZYNW-San-Francisco-CA-94121/2068419581_zpid/
https://www.zillow.com/b/407-fairmount-ave-oakland-ca-9NTzMK/
https://www.zillow.com/homedetails/1940-Buchanan-St-A-San-Francisco-CA-94115/15075413_zpid/
https://www.zillow.com/homedetails/2380-California-St-QZ6SFATJK-San-Francisco-CA-94115/2078197750_zpid/
https://www.zillow.com/homedetails/1883-Agnew-Rd-UNIT-241-Santa-Clara-CA-95054/79841436_zpid/
https://www.zillow.com/b/marina-playa-santa-clara-ca-5XjKBc/
https://www.zillow.com/b/birch-creek-mountain-view-ca-5XjKKB/
https://www.zillow.com/homedetails/969-Clark-Ave-D-Mountain-View-CA-94040/2068419946_zpid/
https://www.zillow.com/homedetails/74-Williams-St-San-Leandro-CA-94577/24879175_zpid/
The problem is the website. It adds the links dynamicaly, so you can try scrolling to the bottom of the page and than searching for the links.
# Scroll the page footer into view so the dynamically-added links load.
bottomFooter = driver.find_element_by_id("region-info-footer")
driver.execute_script("arguments[0].scrollIntoView();", bottomFooter)

How to get href with BeautifulSoup

The Situation
I want to scrape from this website:
http://www.dpm.tn/dpm_pharm/medicament/listmedicparnomspec.php
My code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup

# agent
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"

# headless driver
options = webdriver.ChromeOptions()
options.headless = True
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--window-size=1920,1080")
options.add_argument('--ignore-certificate-errors')
options.add_argument('--allow-running-insecure-content')
options.add_argument("--disable-extensions")
options.add_argument("--proxy-server='direct://'")
options.add_argument("--proxy-bypass-list=*")
options.add_argument("--start-maximized")
options.add_argument('--disable-gpu')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--no-sandbox')
# Raw string so backslashes in the Windows path are not escapes.
driver = webdriver.Chrome(executable_path=r"D:\Downloads\chromedriver.exe", options=options)

# request test
medecine = 'doliprane'

# submiting a search
driver.get('http://www.dpm.tn/dpm_pharm/medicament/listmedicparnomspec.php')
e = driver.find_element_by_name('id')
e.send_keys(medecine)
e.submit()

# geting the result table
try:
    table = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody')
    print('succes')
except:
    print('failed')
The code to get the link :
print('bs4 turn \n')
result = BeautifulSoup(table.get_attribute('innerHTML'), 'lxml')
rows = result.find_all('tr')
links = []
real_link = []
# NOTE(review): rows without an <a> make find() return None, so
# each['href'] below raises "'NoneType' object is not subscriptable".
for row in rows:
    links.append(row.find('a', href=True))
for each in links:
    print(each['href'])
The Problem:
Whenever running this I always get this error:
TypeError: 'NoneType' object is not subscriptable
The question:
How can I get this and find the href attribute as required?
Instead of using selenium use the requests library and fetch data and parse it.
Code:
import re
import requests
from bs4 import BeautifulSoup

# Post the search form directly and parse the response — no browser needed.
medecine = 'doliprane'
url = "http://www.dpm.tn/dpm_pharm/medicament/listmedicspec.php"
payload = {"id": medecine}
response = requests.post(url, data=payload)

soup = BeautifulSoup(response.content, "html.parser")
# Only anchors whose href points at the per-product detail page.
regex = re.compile('fiche.php.*')
anchors = soup.findAll("a", {"href": regex})
links = [
    anchor['href'].replace("fiche.php", "http://www.dpm.tn/dpm_pharm/medicament/fiche.php")
    for anchor in anchors
]
print(links)
Let me know if you have any questions :)
When accessing it try with:
print('bs4 turn \n')
result = BeautifulSoup(table.get_attribute('innerHTML'), 'lxml')
rows = result.find_all('tr')
links = []
real_link = []
for row in rows:
    a = row.find("a", href=True)
    # BUGFIX: guard against rows without a link — find() returns None there,
    # which is exactly the "'NoneType' object is not subscriptable" error.
    if a is not None:
        links.append(a['href'])
for each in links:
    print(each)
I solved it but using selenium instead of Beautiful soup:
# NOTE(review): `max` here shadows the builtin — presumably the row count,
# defined elsewhere by the asker; confirm before reuse.
for i in range(2, max):
    a_driver = driver.find_element_by_xpath(f'/html/body/table/tbody/tr/td/table/tbody/tr[{i}]/td[11]/a')
    result2 = BeautifulSoup(a_driver.get_attribute('innerHTML'), 'lxml')
    link = a_driver.get_attribute('href')
    links.append(link)

# Iterate the list directly instead of indexing by range.
for link in links:
    print(link)
This worked for me.

Unable to scrape google images selenium

I have the following script which i want it to scrapes google images. It clicks on the image first and then clicks on next (>) button to switch to the next image.
It downloads the first image, but when it is the second image's turn it throws an error.
Traceback (most recent call last):
File "c:/Users/intel/Desktop/Scrappr/image_scrape.pyw", line 40, in <module>
attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CLASS_NAME, 'n3VNCb'))).get_attribute("src")
File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
My code :
import requests
import shutil
import time
import urllib
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as Soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/80.0.3987.132 Safari/537.36'

options = Options()
# options.add_argument("--headless")
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--disable-web-security")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--allow-cross-origin-auth-prompt")

driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe", options=options)
driver.get("https://www.google.com/search?q=mac+beautiful+ui&tbm=isch&ved=2ahUKEwiL3ILMveToAhWGCHIKHVPNAScQ2-cCegQIABAA&oq=mac+beautiful+ui&gs_lcp=CgNpbWcQAzoECAAQQzoCCAA6BQgAEIMBOgYIABAFEB46BggAEAgQHlDPI1iEUWCgU2gAcAB4AIAByAKIAd8dkgEHMC40LjkuM5gBAKABAaoBC2d3cy13aXotaW1n&sclient=img&ei=Q9-TXsuuMoaRyAPTmoe4Ag&bih=657&biw=1360")
driver.find_element_by_class_name("rg_i").click()

i = 0
while i < 10:
    i += 1
    time.sleep(5)
    attribute_value = WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb'))
    ).get_attribute("src")
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    resp.raw.decode_content = True
    # with-block closes the file even if copyfileobj raises (the original
    # leaked the handle by never calling close()).
    with open(r'C:/users/intel/desktop/local_image' + str(i) + '.jpg', 'wb') as local_file:
        shutil.copyfileobj(resp.raw, local_file)
    del resp
    # XPath attribute marker restored from '#' to '@'.
    driver.find_element_by_xpath('//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div').click()
I've tidied up and refactored a bit your code. The final result is capable of grabbing n amount of images for keywords of your choice (see SEARCH_TERMS):
import base64
import os
import requests
import time
from io import BytesIO
from PIL import Image
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

CHROME_DRIVER_LOCATION = r'C:\Users\intel\Downloads\setups\chromedriver.exe'
SEARCH_TERMS = ['very', 'hot', 'chicks']
TARGET_SAVE_LOCATION = os.path.join(r'c:\test', '_'.join([x.capitalize() for x in SEARCH_TERMS]), r'{}.{}')
if not os.path.isdir(os.path.dirname(TARGET_SAVE_LOCATION)):
    os.makedirs(os.path.dirname(TARGET_SAVE_LOCATION))


def check_if_result_b64(source):
    """Return the image type if `source` is a data-URI, else False."""
    possible_header = source.split(',')[0]
    if possible_header.startswith('data') and ';base64' in possible_header:
        image_type = possible_header.replace('data:image/', '').replace(';base64', '')
        return image_type
    return False


def get_driver():
    """Build a Chrome driver and open the image-search results page."""
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                 'Chrome/80.0.3987.132 Safari/537.36'
    options = Options()
    # options.add_argument("--headless")
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--allow-cross-origin-auth-prompt")
    new_driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=options)
    new_driver.get(f"https://www.google.com/search?q={'+'.join(SEARCH_TERMS)}&source=lnms&tbm=isch&sa=X")
    return new_driver


driver = get_driver()
first_search_result = driver.find_elements_by_xpath('//a/div/img')[0]
first_search_result.click()

# XPath attribute markers below restored from the mangled '#' to '@'.
right_panel_base = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, f'''//*[@data-query="{' '.join(SEARCH_TERMS)}"]'''))
)
first_image = right_panel_base.find_elements_by_xpath('//*[@data-noaft="1"]')[0]
magic_class = first_image.get_attribute('class')
image_finder_xp = f'//*[@class="{magic_class}"]'

# initial wait for the first image to be loaded
# this part could be improved but I couldn't find a proper way of doing it
time.sleep(3)

# initial thumbnail for "to_be_loaded image"
thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")

for i in range(10):
    # issue 4: All image elements share the same class. Assuming that you always click "next":
    # The last element is the base64 encoded thumbnail version is of the "next image"
    # [-2] element is the element currently displayed
    target = driver.find_elements_by_xpath(image_finder_xp)[-2]

    # you need to wait until image is completely loaded:
    # first the base64 encoded thumbnail will be displayed
    # so we check if the displayed element src match the cached thumbnail src.
    # However sometimes the final result is the base64 content, so wait is capped
    # at 5 seconds.
    wait_time_start = time.time()
    while (target.get_attribute("src") == thumbnail_src) and time.time() < wait_time_start + 5:
        time.sleep(0.2)
    thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")

    attribute_value = target.get_attribute("src")
    print(attribute_value)

    # issue 1: if the image is base64, requests get won't work because the src is not an url
    is_b64 = check_if_result_b64(attribute_value)
    if is_b64:
        image_format = is_b64
        content = base64.b64decode(attribute_value.split(';base64')[1])
    else:
        resp = requests.get(attribute_value, stream=True)
        temp_for_image_extension = BytesIO(resp.content)
        image = Image.open(temp_for_image_extension)
        image_format = image.format
        content = resp.content

    # issue 2: if you 'open' a file, later you have to close it. Use a "with" pattern instead
    with open(TARGET_SAVE_LOCATION.format(i, image_format), 'wb') as f:
        f.write(content)

    # issue 3: this Xpath is bad """//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""" if page layout changes, this path breaks instantly
    svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "click:trigger")]//*[@viewBox="0 0 24 24"]'
    next_arrow = driver.find_elements_by_xpath(svg_arrows_xpath)[-3]
    next_arrow.click()
Disclaimer: I doubt that Google allows scraping on Search. You should check out https://www.google.com/robots.txt to find out.
That being said, I think there is a problem in your WebDriverWait method, though I am not sure what exactly it is. Since you already have your driver wait before that with time.sleep, I just tried to find the element directly, and it worked:
i = 0
while i < 10:
    i += 1
    time.sleep(5)
    attribute_value = driver.find_element_by_css_selector("img.n3VNCb").get_attribute("src")  # NEW LINE
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    resp.raw.decode_content = True
    # with-block closes the file handle the original left open.
    with open(r'C:/users/intel/desktop/local_image' + str(i) + '.jpg', 'wb') as local_file:
        shutil.copyfileobj(resp.raw, local_file)
    del resp
    # XPath attribute marker restored from '#' to '@'.
    driver.find_element_by_xpath('//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div').click()

Categories