I am new to web scraping, so hopefully this question is clear.
I found a tutorial on the internet to scrape Amazon data, based on a given ASIN (unique Amazon number). See : https://www.scrapehero.com/tutorial-how-to-scrape-amazon-product-details-using-python/
When running this code (I adjusted it a bit), I get different results every time, even when running it again 5 seconds later. In my example, one time the titles are found, but 5 seconds later the result is NULL.
I think the reason is that I looked up the XPath via Google Chrome, while at the beginning of the code there is this header:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
My question: how can I scrape the content in a stable way (i.e. reliably getting the real page content for the given ASIN numbers)?
Below is the code to reproduce the issue. You can run the script from the command line:
python script_name.py
Thanks a lot for your help!
The script:
from lxml import html
import csv,os,json
import requests
#from exceptions import ValueError
from time import sleep
def AmzonParser(url):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    page = requests.get(url, headers=headers)
    while True:
        sleep(5)
        try:
            doc = html.fromstring(page.content)

            # Title
            XPATH_NAME = '//*[@id="productTitle"]/text()'
            XPATH_NAME1 = doc.xpath(XPATH_NAME)
            TITLE = ' '.join(''.join(XPATH_NAME1).split()) if XPATH_NAME1 else None

            #XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
            #XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
            #XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
            #XPATH_AVAILABILITY = '//div[@id="availability"]//text()'

            #RAW_NAME = doc.xpath(XPATH_NAME)
            #RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
            #RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
            #RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
            #RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)

            #NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
            #SALE_PRICE = ' '.join(''.join(RAW_SALE_PRICE).split()).strip() if RAW_SALE_PRICE else None
            #CATEGORY = ' > '.join([i.strip() for i in RAW_CATEGORY]) if RAW_CATEGORY else None
            #ORIGINAL_PRICE = ''.join(RAW_ORIGINAL_PRICE).strip() if RAW_ORIGINAL_PRICE else None
            #AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None

            #if not ORIGINAL_PRICE:
            #    ORIGINAL_PRICE = SALE_PRICE

            if page.status_code != 200:
                raise ValueError('captha')
            data = {
                'TITLE': TITLE
                #'SALE_PRICE': SALE_PRICE,
                #'CATEGORY': CATEGORY,
                #'ORIGINAL_PRICE': ORIGINAL_PRICE,
                #'AVAILABILITY': AVAILABILITY,
                #'URL': url,
            }
            return data
        except Exception as e:
            print(e)
def ReadAsin():
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__),"Asinfeed.csv")))
    AsinList = [
        'B00AEINQ9K',
        'B00JWP8F3I']
    extracted_data = []
    for i in AsinList:
        url = "http://www.amazon.com/dp/" + i
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url))
        sleep(5)
    f = open('data_scraped_data.json', 'w')
    json.dump(extracted_data, f, indent=4)

if __name__ == "__main__":
    ReadAsin()
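A likely cause of the flakiness is that Amazon intermittently serves a robot-check page instead of the product page. Note also that in the script above the page is requested once, outside the while True loop, so every retry re-parses the same response. Below is a minimal sketch (the function name and retry counts are illustrative, not part of the tutorial) that re-requests on each attempt and only accepts a response that actually contains a title:

from lxml import html
import requests
from time import sleep

HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}

def fetch_title(url, attempts=5, delay=5):
    for _ in range(attempts):
        page = requests.get(url, headers=HEADERS)   # re-request on every attempt
        doc = html.fromstring(page.content)
        raw_title = doc.xpath('//*[@id="productTitle"]/text()')
        if page.status_code == 200 and raw_title:
            return ' '.join(''.join(raw_title).split())
        sleep(delay)   # probably a robot-check page; wait and try again
    return None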
Related
I am trying to extract the first 100 URLs returned from a location search in Google, however I am getting an empty list every time ("no results found").
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = []
    if results:
        counter = 0
        for result in results:
            websites.append(result.find("a")["href"])
            counter += 1
            if counter == 100:
                break
    else:
        print("No search results found.")
    return websites

location = "Athens"
print(get_location_info(location))
No search results found.
[]
I have also tried this approach:
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = [result.find("a")["href"] for result in results][:10]
    return websites

location = "sifnos"
print(get_location_info(location))
and I get an empty list. I think I am doing everything suggested in similar posts, but I still get nothing.
Always and first of all, take a look at your soup to see if all the expected ingredients are in place.
Select your elements more specifically, in this case for example with a CSS selector:
[a.get('href') for a in soup.select('a:has(>h3)')]
To avoid the consent banner, also send some cookies:
cookies={'CONSENT':'YES+'}
Example
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers, cookies={'CONSENT': 'YES+'})
    soup = BeautifulSoup(response.text, 'html.parser')
    websites = [a.get('href') for a in soup.select('a:has(>h3)')]
    return websites

location = "sifnos"
print(get_location_info(location))
Output
['https://www.griechenland.de/sifnos/', 'http://de.sifnos-greece.com/plan-trip-to-sifnos/travel-information.php', 'https://www.sifnosisland.gr/', 'https://www.visitgreece.gr/islands/cyclades/sifnos/', 'http://www.griechenland-insel.de/Hauptseiten/sifnos.htm', 'https://worldonabudget.de/sifnos-griechenland/', 'https://goodmorningworld.de/sifnos-griechenland/', 'https://de.wikipedia.org/wiki/Sifnos', 'https://sifnos.gr/en/sifnos/', 'https://www.discovergreece.com/de/cyclades/sifnos']
I have a problem with a simple XPath and I can't figure out why it's not working.
I copied the function from a working one, and I seriously don't have a clue why this doesn't work.
I read several tutorials and have a working function in another script, but this function doesn't do what I want. It should get some strings from the webpage, but I just get empty variables.
def getWeather():
    try:
        page = requests.get('https://www.google.com/search?q=wetter&oq=wetter&ie=UTF-8')
    except:
        print('URL not reachable')
    tree = html.fromstring(page.content)
    #print( tree )
    weatherInfo = tree.xpath('//span[@id="wob_dc"]/text()')
    tempInfo = tree.xpath('//span[@id="wob_tm"]/text()')
    windInfo = tree.xpath('//span[@id="wob_ws"]/text()')
    print(weatherInfo)  # empty
    r = str(weatherInfo) + " " + str(tempInfo) + " " + str(windInfo)
    return r
Can you give any advice?
It's all about headers in your requests. This sample works for me:
from lxml import html
import requests

def getWeather():
    try:
        page = requests.get(
            'https://www.google.com/search?q=wetter&oq=wetter&ie=UTF-8',
            headers={
                'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            }
        )
    except:
        print('URL not reachable')
    tree = html.fromstring(page.content)
    #print( tree )
    weatherInfo = tree.xpath('//span[@id="wob_dc"]/text()')
    tempInfo = tree.xpath('//span[@id="wob_tm"]/text()')
    windInfo = tree.xpath('//span[@id="wob_ws"]/text()')
    print(weatherInfo)  # empty
    r = str(weatherInfo) + " " + str(tempInfo) + " " + str(windInfo)
    return r

getWeather()
This is because of Google: their servers do not return the same, stable page to everyone. So this is not really a Python question, but a web question.
A non-web-developer's (my) version: the server builds the weather page according to your location and language, sends it to you, and then throws it away. If you aren't in Germany, you get a different page.
The problem isn't in the XPath, but in the request.
P.S.: I checked this code myself with another link and it works.
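If the variation really does come from locale-dependent pages, one thing worth trying is to pin the language explicitly. This is only a sketch, not a guarantee: the hl parameter and the Accept-Language header fix the language, and the CONSENT cookie (as in the earlier answer) skips the consent banner, but Google may still vary the markup:

import requests
from lxml import html

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    'Accept-Language': 'de-DE,de;q=0.9',   # ask for the German page explicitly
}
page = requests.get('https://www.google.com/search?q=wetter&hl=de',
                    headers=headers, cookies={'CONSENT': 'YES+'})
tree = html.fromstring(page.content)
print(tree.xpath('//span[@id="wob_dc"]/text()'))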
I wrote this code but got the error "IndexError: list index out of range" after running the last line. How do I fix this, please?
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
response = requests.get("https://www.zomato.com/bangalore/top-restaurants", headers=headers)
content = response.content
soup = BeautifulSoup(content, "html.parser")
top_rest = soup.find_all("div", attrs={"class": "sc-bblaLu dOXFUL"})
list_tr = top_rest[0].find_all("div", attrs={"class": "sc-gTAwTn cKXlHE"})
list_rest = []
for tr in list_tr:
    dataframe = {}
    dataframe["rest_name"] = (tr.find("div", attrs={"class": "res_title zblack bold nowrap"})).text.replace('\n', ' ')
    dataframe["rest_address"] = (tr.find("div", attrs={"class": "nowrap grey-text fontsize5 ttupper"})).text.replace('\n', ' ')
    dataframe["cuisine_type"] = (tr.find("div", attrs={"class": "nowrap grey-text"})).text.replace('\n', ' ')
    list_rest.append(dataframe)
list_rest
You are receiving this error because top_rest is empty when you attempt to get its first element with top_rest[0]. The reason is that the class you're attempting to reference is dynamically named: if you refresh the page, the div in the same location will not have the same class name. So when you attempt to scrape it, you get empty results.
An alternative would be to scrape ALL divs and then narrow in on the elements you want. Be mindful of the dynamic div naming scheme, so from one request to another you may get different results:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
response = requests.get("https://www.zomato.com/bangalore/top-restaurants",headers=headers)
content = response.content
soup = BeautifulSoup(content,"html.parser")
top_rest = soup.find_all("div")
list_tr = top_rest[0].find_all("div",attrs={"class": "bke1zw-1 eMsYsc"})
list_tr
I recently did a project that made me research scraping Zomato's website for Manila, Philippines. I used the geopy library to get the longitude and latitude values of Manila City, then scraped the restaurants' details using this information.
ADD: You can get your own API key on the Zomato website to make up to 1000 calls a day.
# Use geopy library to get the latitude and longitude values of Manila City.
from geopy.geocoders import Nominatim

address = 'Manila City, Philippines'
geolocator = Nominatim(user_agent='Makati_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Makati City are {}, {}.'.format(latitude, longitude))
# Use Zomato's API to make calls
import requests
import numpy as np
import pandas as pd

headers = {'user-key': '617e6e315c6ec2ad5234e884957bfa4d'}
venues_information = []

# foursquare_venues is assumed to be a DataFrame (built in an earlier step)
# with 'name', 'lat' and 'lng' columns.
for index, row in foursquare_venues.iterrows():
    print("Fetching data for venue: {}".format(index + 1))
    venue = []
    url = ('https://developers.zomato.com/api/v2.1/search?q={}' +
           '&start=0&count=1&lat={}&lon={}&sort=real_distance').format(row['name'], row['lat'], row['lng'])
    try:
        result = requests.get(url, headers=headers).json()
    except:
        print("There was an error...")
    try:
        if (len(result['restaurants']) > 0):
            venue.append(result['restaurants'][0]['restaurant']['name'])
            venue.append(result['restaurants'][0]['restaurant']['location']['latitude'])
            venue.append(result['restaurants'][0]['restaurant']['location']['longitude'])
            venue.append(result['restaurants'][0]['restaurant']['average_cost_for_two'])
            venue.append(result['restaurants'][0]['restaurant']['price_range'])
            venue.append(result['restaurants'][0]['restaurant']['user_rating']['aggregate_rating'])
            venue.append(result['restaurants'][0]['restaurant']['location']['address'])
            venues_information.append(venue)
        else:
            venues_information.append(np.zeros(6))
    except:
        pass

ZomatoVenues = pd.DataFrame(venues_information,
                            columns=['venue', 'latitude',
                                     'longitude', 'price_for_two',
                                     'price_range', 'rating', 'address'])
Using Web Scraping Language I was able to write this:
GOTO https://www.zomato.com/bangalore/top-restaurants
EXTRACT {'rest_name': '//div[@class="res_title zblack bold nowrap"]',
         'rest_address': '//div[@class="nowrap grey-text fontsize5 ttupper"]',
         'cuisine_type': '//div[@class="nowrap grey-text"]'} IN //div[@class="bke1zw-1 eMsYsc"]
This will iterate over each record element with class bke1zw-1 eMsYsc and pull each restaurant's information.
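For comparison, here is a rough Python/lxml sketch of the same extraction. The class names are the dynamically generated ones from the answer above, so they may well have changed by the time you run it:

from lxml import html
import requests

res = requests.get("https://www.zomato.com/bangalore/top-restaurants",
                   headers={"User-Agent": "Mozilla/5.0"})
doc = html.fromstring(res.content)
records = []
for row in doc.xpath('//div[@class="bke1zw-1 eMsYsc"]'):   # one div per restaurant record
    records.append({
        'rest_name': ''.join(row.xpath('.//div[@class="res_title zblack bold nowrap"]/text()')).strip(),
        'rest_address': ''.join(row.xpath('.//div[@class="nowrap grey-text fontsize5 ttupper"]/text()')).strip(),
        'cuisine_type': ''.join(row.xpath('.//div[@class="nowrap grey-text"]/text()')).strip(),
    })
print(records)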
My goal is to get the product name and price of all Amazon pages detected in any website that I feed to my program.
My input is a text file containing five websites. In each of these websites, a total of five to fifteen amazon links are to be found.
My code is this:
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json
from urllib2 import Request, urlopen, HTTPError, URLError

def isdead(url):
    user_agent = 'Mozilla/20.0.1 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = Request(url, headers=headers)
    sleep(10)
    try:
        page_open = urlopen(req)
    except HTTPError, e:
        return e.code  # 404 if link is broken
    except URLError, e:
        return e.reason
    else:
        return False

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    doc = html.fromstring(page.content)
    XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
    RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
    AVAILABILITY = ''.join(RAw_AVAILABILITY).strip()
    #re.... is a list. if empty, available. if not, unavailable.
    #return re.findall(r'Available from', AVAILABILITY[:30], re.IGNORECASE)
    if len(re.findall(r'unavailable', AVAILABILITY[:30], re.IGNORECASE)) == 1:
        return "unavailable"
    else:
        return "available"

file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    htmls = req.get(i)
    doc = SimplifiedDoc(htmls)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

all_links = [x for x in all_links if "amazon.com/gp/prime" not in x]
all_links = [y for y in all_links if "amazon.com/product-reviews" not in y]

for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    soup = BeautifulSoup(response.content, features="lxml")

    if isdead(i) == 404:
        print "DOES NOT EXIST"
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        pass
    else:
        title = soup.select("#productTitle")[0].get_text().strip()
        if check(i) == "unavailable":
            price = "UNAVAILABLE"
        else:
            if (len(soup.select("#priceblock_ourprice")) == 0) and (len(soup.select("#priceblock_saleprice")) == 0):
                price = soup.select("#a-offscreen")
            elif len(soup.select("#priceblock_ourprice")) == 0:
                price = soup.select("#priceblock_saleprice")
            else:
                price = soup.select("#priceblock_ourprice")
        print "TITLE:%s" % (title)
        print "PRICE:%s" % (price)
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        print "..............................................."

print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
Whenever it works fine, the output looks something like this (please don't judge the PRICE output; I have spent so much time trying to fix that, but nothing works because I can't turn it into a string and get_text() doesn't work. This project is just for personal use, so it's not that important, but if you have suggestions, I'm very receptive to those):
LINK:
https://www.amazon.com/dp/B007Y6LLTM/ref=as_li_ss_tl?ie=UTF8&linkCode=ll1&tag=lunagtkf1-20&linkId=ee8c5299508af57c815ea6577ede4244
TITLE:Moen 7594ESRS Arbor Motionsense Two-Sensor Touchless One-Handle Pulldown Kitchen Faucet Featuring Power Clean, Spot Resist Stainless
PRICE:[<span class="a-size-medium a-color-price priceBlockBuyingPriceString" id="priceblock_ourprice">$359.99</span>]
/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/
... and so on.
The error looks like this:
Traceback (most recent call last):
File "name.py", line 75, in <module>
title = soup.select("#productTitle")[0].get_text().strip()
IndexError: list index out of range
It's so weird: the same text file is fed in many times, and sometimes all sites are scraped well, but sometimes the error appears at the 10th Amazon product, sometimes at the 1st...
I'm suspecting it's a bot detection problem, but I have a header. What's the problem?
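A common cause of an intermittent IndexError like this is that Amazon occasionally serves a robot-check page, so soup.select("#productTitle") comes back empty and indexing [0] blows up. The sketch below (the function and selector fallbacks are illustrative, not the original code) guards every lookup and also shows how to turn a price element into plain text, since select() returns a list and get_text() only works on a single element:

import requests
from bs4 import BeautifulSoup

def scrape_product(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "lxml")

    def text_of(css):
        el = soup.select_one(css)  # returns None instead of raising IndexError when nothing matches
        return el.get_text(strip=True) if el else None

    title = text_of("#productTitle")
    if title is None:
        return None  # most likely a robot-check page or a non-product link; skip or retry it
    price = (text_of("#priceblock_ourprice")
             or text_of("#priceblock_saleprice")
             or text_of("span.a-offscreen")   # a-offscreen appears to be a class, not an id
             or "UNAVAILABLE")
    return {"title": title, "price": price}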
Your code is too messy. I've organized it for you; please check whether it works.
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests

file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    htmls = req.get(i)
    doc = SimplifiedDoc(htmls)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
    amazon_links = amazon_links.notContains(['amazon.com/gp/prime','amazon.com/product-reviews'],attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    if response.status_code == 404:
        print "DOES NOT EXIST"
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        pass
    else:
        html = response.text
        doc = SimplifiedDoc(html)
        title = doc.getElementByID("productTitle").text
        if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable')>0:
            price = "UNAVAILABLE"
        else:
            if doc.getElementByID("priceblock_ourprice"):
                price = doc.getElementByID("priceblock_ourprice").text
            elif doc.getElementByID("priceblock_saleprice"):
                price = doc.getElementByID("priceblock_saleprice").text
            else:
                price = doc.getElementByID("a-offscreen").text
        print "TITLE:%s" % (title)
        print "PRICE:%s" % (price)
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        print "..............................................."

print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
You should learn more :) Here is an example of using the framework; there are more examples of simplified_scrapy here.
If you need any help, please let me know.
from simplified_scrapy.spider import Spider, SimplifiedDoc

class MySpider(Spider):
    name = 'amazon-product'
    # allowed_domains = ['example.com']
    start_urls = []
    refresh_urls = True  # For debug. If refresh_urls = True, start_urls will be crawled again.

    filepath = ''  # Your file path
    if filepath:
        with open(filepath) as f:
            start_urls = [line.rstrip('\n') for line in f]

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        amazon_links = None
        data = None
        if url['url'].find('https://www.amazon.com')>=0 or url['url'].find('https://amzn.to')>=0:
            title = doc.getElementByID("productTitle").text
            if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable')>0:
                price = "UNAVAILABLE"
            else:
                if doc.getElementByID("priceblock_ourprice"):
                    price = doc.getElementByID("priceblock_ourprice").text
                elif doc.getElementByID("priceblock_saleprice"):
                    price = doc.getElementByID("priceblock_saleprice").text
                else:
                    price = doc.getElementByID("a-offscreen").text
            data = [{"title": title, 'price': price}]  # Get target data
            print "TITLE:%s" % (title)
            print "PRICE:%s" % (price)
            print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        else:
            amazon_links = doc.getElements('a')
            amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
            amazon_links = amazon_links.notContains(['amazon.com/gp/prime','amazon.com/product-reviews'],attr='href')
        return {"Urls": amazon_links, "Data": data}  # Return data to framework

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(MySpider())  # Start crawling
I've been playing around with web scraping (for this practice exercise using Python 3.6.2) and I feel like I'm losing it a bit. Given this example link, here's what I want to do:
First, as you can see, there are multiple categories on the page. Clicking each of the categories above will give me other categories, then others, and so on, until I reach the products page. So I have to go x levels deep. I thought recursion would help me achieve this, but somewhere I did something wrong.
Code:
Here, I'll explain the way I approached the problem. First, I created a session and a simple generic function which returns an lxml.html.HtmlElement object:
from lxml import html
from requests import Session

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/62.0.3202.94 Safari/537.36"
}
TEST_LINK = 'https://www.richelieu.com/us/en/category/custom-made-cabinet-doors-and-drawers/1000128'

session_ = Session()

def get_page(url):
    page = session_.get(url, headers=HEADERS).text
    return html.fromstring(page)
Then, I thought I'd need two other functions:
one to get the category links
and another one to get the product links
To distinguish between the two, I noticed that only category pages have a title that always contains CATEGORIES, so I used that:
def read_categories(page):
    categs = []
    try:
        if 'CATEGORIES' in page.xpath('//div[@class="boxData"][2]/h2')[0].text.strip():
            for a in page.xpath('//*[@id="carouselSegment2b"]//li//a'):
                categs.append(a.attrib["href"])
            return categs
        else:
            return None
    except Exception:
        return None

def read_products(page):
    return [
        a_tag.attrib["href"]
        for a_tag in page.xpath("//ul[@id='prodResult']/li//div[@class='imgWrapper']/a")
    ]
Now, the only thing left is the recursion part, where I'm sure I did something wrong:
def read_all_categories(page):
    cat = read_categories(page)
    if not cat:
        yield read_products(page)
    else:
        yield from read_all_categories(page)

def main():
    main_page = get_page(TEST_LINK)
    for links in read_all_categories(main_page):
        print(links)
Here's all the code put together:
from lxml import html
from requests import Session

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/62.0.3202.94 Safari/537.36"
}
TEST_LINK = 'https://www.richelieu.com/us/en/category/custom-made-cabinet-doors-and-drawers/1000128'

session_ = Session()

def get_page(url):
    page = session_.get(url, headers=HEADERS).text
    return html.fromstring(page)

def read_categories(page):
    categs = []
    try:
        if 'CATEGORIES' in page.xpath('//div[@class="boxData"][2]/h2')[0].text.strip():
            for a in page.xpath('//*[@id="carouselSegment2b"]//li//a'):
                categs.append(a.attrib["href"])
            return categs
        else:
            return None
    except Exception:
        return None

def read_products(page):
    return [
        a_tag.attrib["href"]
        for a_tag in page.xpath("//ul[@id='prodResult']/li//div[@class='imgWrapper']/a")
    ]

def read_all_categories(page):
    cat = read_categories(page)
    if not cat:
        yield read_products(page)
    else:
        yield from read_all_categories(page)

def main():
    main_page = get_page(TEST_LINK)
    for links in read_all_categories(main_page):
        print(links)

if __name__ == '__main__':
    main()
Could someone please point me in the right direction regarding the recursion function?
Here is how I would solve this:
from lxml import html as html_parser
from requests import Session

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
}

def dig_up_products(url, session=Session()):
    html = session.get(url, headers=HEADERS).text
    page = html_parser.fromstring(html)

    # if it appears to be a categories page, recurse
    for link in page.xpath('//h2[contains(., "CATEGORIES")]/'
                           'following-sibling::div[@id="carouselSegment1b"]//li//a'):
        yield from dig_up_products(link.attrib["href"], session)

    # if it appears to be a products page, return the links
    for link in page.xpath('//ul[@id="prodResult"]/li//div[@class="imgWrapper"]/a'):
        yield link.attrib["href"]

def main():
    start = 'https://www.richelieu.com/us/en/category/custom-made-cabinet-doors-and-drawers/1000128'
    for link in dig_up_products(start):
        print(link)

if __name__ == '__main__':
    main()
There is nothing wrong with iterating over an empty XPath expression result, so you can simply put both cases (categories page/products page) into the same function, as long as the XPath expressions are specific enough to identify each case.
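The point about empty results is easy to verify: xpath() returns a plain list, and looping over an empty list is simply a no-op, so the branch that doesn't apply to a given page contributes nothing. A tiny demonstration:

from lxml import html

doc = html.fromstring("<div><p>hello</p></div>")
# An expression that matches nothing returns [], so the loop body never runs.
for node in doc.xpath('//span[@class="does-not-exist"]'):
    print(node.text)   # never reached
print(doc.xpath('//span[@class="does-not-exist"]'))   # -> []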
You can do it like this as well to make your script slightly more concise. I used the lxml library along with CSS selectors to do the job. The script parses all the links under each category and looks for a dead end; when it reaches one, it parses the title from there, then repeats the whole process until all the links are exhausted.
from lxml.html import fromstring
import requests

def products_links(link):
    res = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
    page = fromstring(res.text)
    try:
        for item in page.cssselect(".contentHeading h1"):  # check for the match available in target page
            print(item.text)
    except:
        pass
    for link in page.cssselect("h2:contains('CATEGORIES')+[id^='carouselSegment'] .touchcarousel-item a"):
        products_links(link.attrib["href"])

if __name__ == '__main__':
    main_page = 'https://www.richelieu.com/us/en/category/custom-made-cabinet-doors-and-drawers/1000128'
    products_links(main_page)
Partial result:
BRILLANTÉ DOORS
BRILLANTÉ DRAWER FRONTS
BRILLANTÉ CUT TO SIZE PANELS
BRILLANTÉ EDGEBANDING
LACQUERED ZENIT DOORS
ZENIT CUT-TO-SIZE PANELS
EDGEBANDING
ZENIT CUT-TO-SIZE PANELS