Scrape page using Python requests - python

I have some problems with web scraping, here is my code:
from bs4 import BeautifulSoup
import requests
import re
import csv
import argparse

def save_csv_file(filename, array):
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["item_name", "item_price", "item_category"])
        writer.writerows(array)

def process_data(name, price, category):
    item_name = name.text if name else 'NA'
    item_price = price.text if price else 'NA'
    item_category = category.text if category else 'NA'
    item_name = item_name.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_price = item_price.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_category = item_category.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    return (item_name, item_price, item_category)

def do_scrap(filename, url, payload, headers):
    # Request the URL with parameters and headers
    r = requests.post(url, payload, headers=headers, allow_redirects=True)
    if r.status_code == 200:
        # Save response content in html variable
        html = r.content
        # Parse the html with bs4
        parsed_html = BeautifulSoup(html, "html.parser")
        # Print document title
        print parsed_html.head.find('title').text
        # Find all of the HTML elements which describe hotels
        tables = parsed_html.find_all("a", {"class": "result-link"})
        # Print the number of hotels found
        print "Found %s records." % len(tables)
        # Empty helpers
        items = []
        count = 0
        # Loop over the HTML elements and collect the properties of each hotel
        for table in tables:
            name = table.find("h3", {"class": "result-title"})
            price = table.find("p", {"class": "price text-truncate"})
            category = table.find("p", {"class": "merchant-name text-truncate"})
            items.append(process_data(name, price, category))
            count += 1
        if count > 0:
            # Save array with data to csv file
            save_csv_file(filename=filename, array=items)
            # Print end of job info
            print "\n%s records downloaded and saved to %s." % (count, filename)
    else:
        print "Code error: %s" % r.status_code

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--product", required=True, help="Product name")
    ap.add_argument("-c", "--category", default="", help="Product category")
    args = vars(ap.parse_args())
    product = args['product']
    category = args['category']
    payload = {
        'siteSearchQuery': product,
        'from': 'colibri'
    }
    headers = {
        'Host': 'www.kelkoo.co.uk',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
    }
    url = "http://www.kelkoo.co.uk/ctl/do/search"
    filename = "%s_co_uk_kelkoo_data.csv" % product
    do_scrap(
        filename=filename,
        url=url,
        payload=payload,
        headers=headers)
After this request I get a different result than when I put this:
www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri
into my web browser. What is causing this problem? Is it something related to page redirection?

I can see multiple things that will cause you to get different results:
You initiate a POST, not a GET. Look up the params argument of requests.get.
They use JavaScript to modify the page.
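For comparison, here is a minimal sketch of the GET equivalent. The parameter names are taken from the URL in the question; whether the site still accepts plain GET requests of this form is an assumption:
import requests

# Build the same query string the browser URL uses
params = {
    'siteSearchQuery': 'nokia 130',  # example query from the question
    'from': 'colibri',
}
headers = {'User-Agent': 'Mozilla/5.0'}  # any realistic browser UA, as in the question

r = requests.get("http://www.kelkoo.co.uk/ctl/do/search",
                 params=params, headers=headers, allow_redirects=True)
print(r.url)          # final URL after redirects; should match what you type in the browser
print(r.status_code)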

Related

How do I import the data (a list of URLs) from a CSV instead of using a search string? Thank you a lot.

I'm a complete newbie in Python, and I need to combine the first and second pieces of code because I want to read the data (phones and ZIPs) from a list of URLs stored in a CSV. I have tried a lot.
This code is from Python 360 on YouTube:
# read csv with just url per line
with open('urls.csv') as file:
    start_urls = [line.strip() for line in file]

def start_request(self):
    request = Request(url=self.start_urls, callback=self.parse)
    yield request

def parse(self, response):
    html = response.body
    soup = BeautifulSoup(html, 'lxml')
    text = soup.get_text()
And this is from nageshsinghc4 (GitHub) :)
...
for row in df2.iterrows():  # Parse through each url in the list.
    try:
        try:
            req1 = Request(row[1]['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
            gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)  # Bypass SSL certificate verification
            f = urlopen(req1, context=gcontext)
            url_name = f.geturl()  # extract URL name
            s = f.read()
            phone = re.findall(r"((?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4})", s)  # Phone regex
            emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}", s)  # Email regex
            if len(phone) == 0:
                print("No phone number found.")
                err_msg_phn = "No phone number found."
                phn_1.append((url_name, err_msg_phn))
            else:
                count = 1
                for item in phone:
                    phn_1.append((url_name, item))
                    count += 1
            print(phn_1)
            if len(emails) == 0:
                print("No email address found.")
                err_msg_mail = "No email address found."
                mail_1.append((url_name, err_msg_mail))
            else:
                count = 1
                for item in emails:
                    mail_1.append((url_name, item))
                    count += 1
            print(mail_1)
Thank you very much.
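If Scrapy is more than you need, here is a minimal sketch that combines the two ideas with plain requests and BeautifulSoup. The file name urls.csv and the phone regex are taken from the snippets above; everything else (the User-Agent string, the result handling) is illustrative:
import re
import requests
from bs4 import BeautifulSoup

# Read the CSV: one URL per line, as in the first snippet
with open('urls.csv') as f:
    start_urls = [line.strip() for line in f if line.strip()]

results = []
for url in start_urls:
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    if r.status_code != 200:
        results.append((url, 'request failed: %s' % r.status_code))
        continue
    text = BeautifulSoup(r.text, 'lxml').get_text()
    # Same phone pattern as in the second snippet
    phones = re.findall(r"(?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4}", text)
    results.append((url, phones if phones else 'no phone number found'))

print(results)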

Python scraping data of multiple pages issue

I'm running into an issue where my code scrapes everything from the first page only. I want to scrape data from multiple pages, the same way as from the first page. I actually also wrote code for multiple pages, and it does move forward to the next page, but it scrapes the data of the first page again. Please have a look at my code and guide me on how I can fix this issue. Thanks!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup

def get_detail_page(soup):
    try:
        title = soup.find('h1', class_="cdm_style", id=False).text
    except:
        title = 'Empty Title'
    try:
        collection = soup.find('td', id="metadata_collec").find('a').text
    except:
        collection = "Empty Collection"
    try:
        author = soup.find('td', id="metadata_creato").text
    except:
        author = "Empty Author"
    try:
        abstract = soup.find('td', id="metadata_descri").text
    except:
        abstract = "Empty Abstract"
    try:
        keywords = soup.find('td', id="metadata_keywor").text
    except:
        keywords = "Empty Keywords"
    try:
        publishers = soup.find('td', id="metadata_publis").text
    except:
        publishers = "Empty Publishers"
    try:
        date_original = soup.find('td', id="metadata_contri").text
    except:
        date_original = "Empty Date original"
    try:
        date_digital = soup.find('td', id="metadata_date").text
    except:
        date_digital = "Empty Date digital"
    try:
        formatt = soup.find('td', id="metadata_source").text
    except:
        formatt = "Empty Format"
    try:
        release_statement = soup.find('td', id="metadata_rights").text
    except:
        release_statement = "Empty Realease Statement"
    try:
        library = soup.find('td', id="metadata_librar").text
    except:
        library = "Empty Library"
    try:
        date_created = soup.find('td', id="metadata_dmcreated").text
    except:
        date_created = "Empty date Created"
    data = {
        'Title': title.strip(),
        'Collection': collection.strip(),
        'Author': author.strip(),
        'Abstract': abstract.strip(),
        'Keywords': keywords.strip(),
        'Publishers': publishers.strip(),
        'Date_original': date_original.strip(),
        'Date_digital': date_digital.strip(),
        'Format': formatt.strip(),
        'Release-st': release_statement.strip(),
        'Library': library.strip(),
        'Date_created': date_created.strip()
    }
    return data

def get_index_data(soup):
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except:
        titles_link = []
    else:
        titles_link_output = []
        for link in titles_link:
            try:
                item_id = link.attrs.get('item_id', None)  # All titles with valid links will have an item_id
                if item_id:
                    titles_link_output.append("{}{}".format("http://cgsc.cdmhost.com", link.attrs.get('href', None)))
            except:
                continue
        return titles_link_output

def write_csv(data, url):
    with open('1111_to_5555.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['Title'], data['Collection'], data['Author'],
               data['Abstract'], data['Keywords'], data['Publishers'], data['Date_original'],
               data['Date_digital'], data['Format'], data['Release-st'], data['Library'],
               data['Date_created'], url]
        writer.writerow(row)

def main():
    for x in range(2, 4):
        mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/"
        print(x)
        url = f"{mainurl}{x}"
        products = get_index_data(get_page(url))
        for product in products:
            data1 = get_detail_page(get_page(product))
            write_csv(data1, product)

if __name__ == '__main__':
    main()
In the get_page() function, try adding headers to the request (requests expects headers as a dict, so wrap the User-Agent string in one):
def get_page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    response = requests.get(url, headers=headers)
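A quick way to verify that the pagination is really being requested is to print the final URL that requests resolved. This is only a sketch of a debugging aid, not a guaranteed fix; the assumption is that the site honours the path-based .../page/2, .../page/3 pagination for plain GET requests:
def get_page(url):
    headers = {'User-Agent': 'Mozilla/5.0'}  # any realistic browser UA string
    response = requests.get(url, headers=headers)
    # response.url is the final URL after redirects; check the page number is still in it
    print('requested:', response.url, '->', response.status_code)
    if not response.ok:
        return None
    return BeautifulSoup(response.text, 'html.parser')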

Why does my web scraper only work half the time?

My goal is to get the product name and price from every Amazon page detected in any website that I feed to my program.
My input is a text file containing five websites. In each of these websites, five to fifteen Amazon links are to be found.
My code is this:
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json
from urllib2 import Request, urlopen, HTTPError, URLError

def isdead(url):
    user_agent = 'Mozilla/20.0.1 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = Request(url, headers=headers)
    sleep(10)
    try:
        page_open = urlopen(req)
    except HTTPError, e:
        return e.code  # 404 if link is broken
    except URLError, e:
        return e.reason
    else:
        return False

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    doc = html.fromstring(page.content)
    XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
    RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
    AVAILABILITY = ''.join(RAW_AVAILABILITY).strip()
    # re.... is a list. if empty, available. if not, unavailable.
    # return re.findall(r'Available from', AVAILABILITY[:30], re.IGNORECASE)
    if len(re.findall(r'unavailable', AVAILABILITY[:30], re.IGNORECASE)) == 1:
        return "unavailable"
    else:
        return "available"

file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)
with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    htmls = req.get(i)
    doc = SimplifiedDoc(htmls)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
all_links = [x for x in all_links if "amazon.com/gp/prime" not in x]
all_links = [y for y in all_links if "amazon.com/product-reviews" not in y]

for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    soup = BeautifulSoup(response.content, features="lxml")
    if isdead(i) == 404:
        print "DOES NOT EXIST"
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        pass
    else:
        title = soup.select("#productTitle")[0].get_text().strip()
        if check(i) == "unavailable":
            price = "UNAVAILABLE"
        else:
            if (len(soup.select("#priceblock_ourprice")) == 0) and (len(soup.select("#priceblock_saleprice")) == 0):
                price = soup.select("#a-offscreen")
            elif len(soup.select("#priceblock_ourprice")) == 0:
                price = soup.select("#priceblock_saleprice")
            else:
                price = soup.select("#priceblock_ourprice")
        print "TITLE:%s" % (title)
        print "PRICE:%s" % (price)
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"

print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
Whenever it works fine, the output looks something like this (please don't judge the PRICE output; I have spent so much time trying to fix it, but nothing works because I can't turn it into a string and get_text() doesn't work. This project is just for personal use, so it's not that important, but if you have suggestions I'm very receptive to them):
LINK:
https://www.amazon.com/dp/B007Y6LLTM/ref=as_li_ss_tl?ie=UTF8&linkCode=ll1&tag=lunagtkf1-20&linkId=ee8c5299508af57c815ea6577ede4244
TITLE:Moen 7594ESRS Arbor Motionsense Two-Sensor Touchless One-Handle Pulldown Kitchen Faucet Featuring Power Clean, Spot Resist Stainless
PRICE:[<span class="a-size-medium a-color-price priceBlockBuyingPriceString" id="priceblock_ourprice">$359.99</span>]
/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/
... and so on.
The error looks like this:
Traceback (most recent call last):
File "name.py", line 75, in <module>
title = soup.select("#productTitle")[0].get_text().strip()
IndexError: list index out of range
It's so weird, because the same text file is fed in every time, and sometimes all sites are scraped well, but sometimes the error appears at the 10th Amazon product, and sometimes at the 1st product...
I suspect it's a bot-detection problem, but I have a header. What's the problem?
Your code is too messy. I've reorganized it for you; please check whether it works.
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests

file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)
with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    htmls = req.get(i)
    doc = SimplifiedDoc(htmls)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    amazon_links = amazon_links.notContains(['amazon.com/gp/prime', 'amazon.com/product-reviews'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    if response.status_code == 404:
        print "DOES NOT EXIST"
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        pass
    else:
        html = response.text
        doc = SimplifiedDoc(html)
        title = doc.getElementByID("productTitle").text
        if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable') > 0:
            price = "UNAVAILABLE"
        else:
            if doc.getElementByID("priceblock_ourprice"):
                price = doc.getElementByID("priceblock_ourprice").text
            elif doc.getElementByID("priceblock_saleprice"):
                price = doc.getElementByID("priceblock_saleprice").text
            else:
                price = doc.getElementByID("a-offscreen").text
        print "TITLE:%s" % (title)
        print "PRICE:%s" % (price)
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"

print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
You should learn more :) Here is an example of using the framework.
There are more examples of simplified_scrapy here.
If you need any help, please let me know.
from simplified_scrapy.spider import Spider, SimplifiedDoc

class MySpider(Spider):
    name = 'amazon-product'
    # allowed_domains = ['example.com']
    start_urls = []
    refresh_urls = True  # For debug. If refresh_urls = True, start_urls will be crawled again.
    filepath = ''  # Your file path
    if filepath:
        with open(filepath) as f:
            start_urls = [line.rstrip('\n') for line in f]

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        amazon_links = None
        data = None
        if url['url'].find('https://www.amazon.com') >= 0 or url['url'].find('https://amzn.to') >= 0:
            title = doc.getElementByID("productTitle").text
            if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable') > 0:
                price = "UNAVAILABLE"
            else:
                if doc.getElementByID("priceblock_ourprice"):
                    price = doc.getElementByID("priceblock_ourprice").text
                elif doc.getElementByID("priceblock_saleprice"):
                    price = doc.getElementByID("priceblock_saleprice").text
                else:
                    price = doc.getElementByID("a-offscreen").text
            data = [{"title": title, 'price': price}]  # Get target data
            print "TITLE:%s" % (title)
            print "PRICE:%s" % (price)
            print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        else:
            amazon_links = doc.getElements('a')
            amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
            amazon_links = amazon_links.notContains(['amazon.com/gp/prime', 'amazon.com/product-reviews'], attr='href')
        return {"Urls": amazon_links, "Data": data}  # Return data to framework

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(MySpider())  # Start crawling
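Going back to the BeautifulSoup version in the question, the intermittent IndexError can also be avoided with a guard before indexing, since a missing #productTitle usually means Amazon returned something other than the product page (for example a robot-check page, which would fit the suspicion of bot detection). A minimal sketch; it belongs inside the loop over all_links and reuses the question's variable names:
title_tags = soup.select("#productTitle")
if not title_tags:
    # No product title on this response, likely a captcha / robot-check page; skip it instead of crashing
    print "SKIPPED (no #productTitle found): %s" % i
    continue
title = title_tags[0].get_text().strip()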

Newbie: Python "AttributeError: 'NoneType' object has no attribute 'text' " when scraping Tripadvisor Reviews

I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
It worked (after removing the "language" attribute) for one link, but it doesn't work for any other link (for example).
I'm receiving the error:
> Traceback (most recent call last):
> File "<pyshell#27>", line 4, in <module>
> items = scrape(url)
> File "<pyshell#12>", line 11, in scrape
> items = parse(session, url + '?filterLang=' + lang)
> File "<pyshell#15>", line 12, in parse
> num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
> AttributeError: 'NoneType' object has no attribute 'text'
I'm attaching the code here with the changes I made in case someone can help me.
Thank you so much!
Silvia
--
I substituted the original:
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
with
num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
With the original code I get the error
ValueError: invalid literal for int() with base 10: '5.695'
(where 5.695 is the number of reviews in the page)
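That ValueError only means the review count is rendered with a thousands separator, so the separator has to be stripped before the int() conversion; for example:
num_reviews = int('5.695'.replace('.', '').replace(',', ''))  # -> 5695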
--
Here is the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io

def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
        f.write(content)
    webbrowser.open(filename)

def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''
    r = session.post(url, data=params)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):
    # create session to keep all cookies (etc.) between requests
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    items = parse(session, url + '?filterLang=' + lang)
    return items

def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text  # get text
    num_reviews = num_reviews[1:-1]
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews)  # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)
    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)
    items = []
    offset = 0
    while True:
        subpage_url = url_template.format(offset)
        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        if len(subpage_items) < 5:
            break
        offset += 5
    return items

def get_reviews_ids(soup):
    items = soup.find_all('div', attrs={'data-reviewid': True})
    if items:
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def get_more(session, reviews_ids):
    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
    payload = {
        'reviews': ','.join(reviews_ids),  # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR',  # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX',  # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }
    soup = post_soup(session, url, payload)
    return soup

def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    hotel_name = soup.find('h1', id='HEADING').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    soup = get_more(session, reviews_ids)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'
        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''
        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]
        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'],  # 'ratingDate' instead of 'relativeDate'
        }
        items.append(item)
        print('\n--- review ---\n')
        for key, val in item.items():
            print(' ', key, ':', val)
        print()
    return items

def write_in_csv(items, filename='results.csv',
                 headers=['hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name', 'user location', 'rating'],
                 mode='w'):
    print('--- CSV ---')
    with io.open(filename, mode, encoding="utf-8") as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)
        if mode == 'w':
            csv_file.writeheader()
        csv_file.writerows(items)

DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'

start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]

headers = [
    DB_COLUMN,
    DB_COLUMN1,
]

lang = 'it'

for url in start_urls:
    # get all reviews for 'url' and 'lang'
    items = scrape(url)
    if not items:
        print('No reviews')
    else:
        # write in CSV
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')
I realized the problem lies in the page source:
hotel_name = soup.find('h1', id='HEADING').text
found no element with that id on the target website. I substituted it with:
hotel_name = soup.find('h1', class_='heading').text
I hope it can help others!
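More generally, the AttributeError goes away if the result of find() is checked before .text is taken, because find() returns None whenever the class name no longer matches the page markup. A small sketch using the selector from the question (whatever class TripAdvisor currently uses is an assumption):
tag = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH')
if tag is None:
    print('[parse] review-count element not found; the page layout may have changed:', url)
    return
# strip the surrounding parentheses and any thousands separators before converting
num_reviews = int(tag.text.strip('() ').replace(',', '').replace('.', ''))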

Unable to expand more... python

I can scrape all the reviews from the web page, but I am not getting the full content. I can only scrape half of each review's content, and I need the full content.
from bs4 import BeautifulSoup
import requests
import re
s = requests.Session()
def get_soup(url):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    r = s.get(url, headers=headers)
    #with open('temp.html', 'wb') as f:
    #    f.write(r.content)
    #    webbrowser.open('temp.html')
    if r.status_code != 200:
        print('status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def parse(url, response):
    if not response:
        print('no response:', url)
        return
    # get number of reviews
    # num_reviews = response.find('span', class_='reviews_header_count').text
    # num_reviews = num_reviews[1:-1]  # remove `( )`
    # num_reviews = num_reviews.replace(',', '')  # remove `,`
    # num_reviews = int(num_reviews)
    # print('num_reviews:', num_reviews, type(num_reviews))
    num_reviews = 20
    print('num_reviews:', num_reviews, type(num_reviews))
    # create template for urls to pages with reviews
    url = url.replace('Hilton_New_York_Grand_Central-New_York_City_New_York.html', 'or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html')
    print('template:', url)
    # add requests to list
    for offset in range(0, num_reviews, 5):
        print('url:', url.format(offset))
        url_ = url.format(offset)
        parse_reviews(url_, get_soup(url_))
        #return  # for test only - to stop after first page

def parse_reviews(url, response):
    print('review:', url)
    if not response:
        print('no response:', url)
        return
    for idx, review in enumerate(response.find_all('div', class_='review-container')):
        item = {
            'hotel_name': response.find('h1', class_='heading_title').text,
            'review_title': review.find('span', class_='noQuotes').text,
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='relativeDate')['title'],  #.text,#[idx],
            # 'num_reviews_reviewer': review.find('span', class_='badgetext').text,
            'reviewer_name': review.find('span', class_='scrname').text,
            'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
        }
        #~ yield item
        results.append(item)
        for key, val in item.items():
            print(key, ':', val)
        print('----')
        #return  # for test only - to stop after first review

start_urls = [
    'https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-Hilton_New_York_Grand_Central-New_York_City_New_York.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]

results = []
for url in start_urls:
    parse(url, get_soup(url))

import pandas as pd
df = pd.DataFrame(results)  # convert list to DataFrame
df.to_csv('output.csv')
I am getting output samples like this in the CSV file:
I went on a family trip and it was amazing, I hope to come back soon. The room was small but what can you expect from New York. It was close to many things and the staff was perfect.I will come back again soon.More...
I just want to expand that "More...". I need help; I really have no clue how to do it. Please help.
I have written one more piece of code, but I am unable to pull the id from the next page. The code is given below:
import re
import urllib
#import webbrowser

s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
for i in range(0, 10, 5):
    url = "https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html".format(i)
    print(url)
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    pattern = re.compile(r"UID_(\w+)\-SRC_(\w+)")
    id = soup.find("div", id=pattern)["id"]
    uid = pattern.match(id).group(2)
    print(uid)
    url1 = "https://www.tripadvisor.in/ShowUserReviews-g60763-d93339-r" + str(uid) + "-Hilton_New_York_Grand_Central-New_York_City_New_York.html#CHECK_RATES_CONT"
    print(url1)
    url2 = '"' + url1 + '"'
    print(url2)
The site uses ajax to expand the review content. The full content is not downloaded until the More link is clicked.
One way to access the content would be to figure out the AJAX request format and then issue an HTTP request of the same form. That might be difficult, or perhaps not.
Another, easier, way is by noticing that the review title is a clickable link which loads the full review in a new page. You can therefore scrape the URL for each review and send a similar GET request. Then scrape the data from the response.
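Here is a sketch of that second approach, reusing the get_soup() helper from the question. The assumption is that each review container wraps the title (span.noQuotes) in an anchor whose href points at the review's own ShowUserReviews page:
def parse_full_reviews(url, response):
    if not response:
        print('no response:', url)
        return
    for review in response.find_all('div', class_='review-container'):
        title_span = review.find('span', class_='noQuotes')            # title text, as above
        anchor = title_span.find_parent('a') if title_span else None   # clickable title link (assumed markup)
        if not anchor or not anchor.get('href'):
            continue
        full_url = 'https://www.tripadvisor.in' + anchor['href']
        full_soup = get_soup(full_url)                                  # helper defined in the question
        if not full_soup:
            continue
        entry = full_soup.find('p', class_='partial_entry')             # on the detail page this holds the full text
        if entry:
            print(entry.text)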
