I can scrape all the reviews from the web page.But I am not getting full content.Only half review content i can scrape.I need to scrape the full content.
from bs4 import BeautifulSoup import requests import re
s = requests.Session()
def get_soup(url):
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
r = s.get(url, headers=headers)
#with open('temp.html', 'wb') as f:
# f.write(r.content)
# webbrowser.open('temp.html')
if r.status_code != 200:
print('status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def parse(url, response):
if not response:
print('no response:', url)
return
# get number of reviews
# num_reviews = response.find('span', class_='reviews_header_count').text
# num_reviews = num_reviews[1:-1] # remove `( )`
# num_reviews = num_reviews.replace(',', '') # remove `,`
# num_reviews = int(num_reviews)
# print('num_reviews:', num_reviews, type(num_reviews))
num_reviews = (20)
# num_reviews = num_reviews[1:-1] # remove `( )`
# num_reviews = num_reviews.replace(',', '') # remove `,`
# num_reviews = int(num_reviews)
print('num_reviews:', num_reviews, type(num_reviews))
# create template for urls to pages with reviews
url = url.replace('Hilton_New_York_Grand_Central-New_York_City_New_York.html', 'or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html')
print('template:', url)
# add requests to list
for offset in range(0, num_reviews, 5):
print('url:', url.format(offset))
url_ = url.format(offset)
parse_reviews(url_, get_soup(url_))
#return # for test only - to stop after first page
def parse_reviews(url, response):
print('review:', url)
if not response:
print('no response:', url)
return
for idx, review in enumerate(response.find_all('div', class_='review-container')):
item = {
'hotel_name': response.find('h1', class_='heading_title').text,
'review_title': review.find('span', class_='noQuotes').text,
'review_body': review.find('p', class_='partial_entry').text,
'review_date': review.find('span', class_='relativeDate')['title'],#.text,#[idx],
# 'num_reviews_reviewer': review.find('span', class_='badgetext').text,
'reviewer_name': review.find('span', class_='scrname').text,
'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
}
#~ yield item
results.append(item)
for key,val in item.items():
print(key, ':', val)
print('----')
#return # for test only - to stop after first review
start_urls = [
'https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-Hilton_New_York_Grand_Central-New_York_City_New_York.html',
#'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
#'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html', ]
results = []
for url in start_urls:
parse(url, get_soup(url))
import pandas as pd
df = pd.DataFrame(results) # <--- convert list to DataFrame df.to_csv('output.csv')
I am getting an output sample in csv file from review like:
I went on a family trip and it was amazing, I hope to come back soon. The room was small but what can you expect from New York. It was close to many things and the staff was perfect.I will come back again soon.More...
I just want to expand that more. I need a help..I really have no clue to do it.Please help.
I have written one more code but unable to pull the id from next page.Code is given below
import re
import urllib
#import webbrowser``
s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
for i in range(0,10,5):
url = ("https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html").format(i)
print(url)
r = s.get(url,headers=headers)
html = BeautifulSoup(r.text, 'html.parser')
pattern = re.compile(r"UID_(\w+)\-SRC_(\w+)")
id = soup.find("div", id=pattern)["id"]
uid = pattern.match(id).group(2)
print(uid)
url1 ="https://www.tripadvisor.in/ShowUserReviews-g60763-d93339-r"+str(uid)+"-Hilton_New_York_Grand_Central-New_York_City_New_York.html#CHECK_RATES_CONT"
print(url1)
url2 = ('"' + url1 + '"')`enter code here`
print(url2)
The site uses ajax to expand the review content. The full content is not downloaded until the More link is clicked.
One way to access the content would be to figure out the ajax request format and then issue a HTTP request for the same. That might be difficult, perhaps not.
Another, easier, way is by noticing that the review title is a clickable link which loads the full review in a new page. You can therefore scrape the URL for each review and send a similar GET request. Then scrape the data from the response.
Related
<img class="no-img" data-src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium" alt="Biryani By Kilo" data-gatype="RestaurantImageClick" data-url="/delhi/biryani-by-kilo-connaught-place-central-delhi-40178" data-w-onclick="cardClickHandler" src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium">
page url - https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p=1
this page contains some restaurants card now while scrapping the page in the loop I want to go inside the restaurant card URL which is in the above HTML code name by data-url class and scrape the no. of reviews from inside it, I don't know how to do it my current code for normal front page scrapping is ;
def extract(page):
url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}" # URL of the website
header = {'User-Agent':'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'} # Temporary user agent
r = requests.get(url, headers=header)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
def transform(soup): # function to scrape the page
divs = soup.find_all('div', class_ = 'restnt-card restaurant')
for item in divs:
title = item.find('a').text.strip() # restaurant name
loc = item.find('div', class_ = 'restnt-loc ellipsis').text.strip() # restaurant location
try: # used this try and except method because some restaurants are unrated and while scrpaping those we would run into an error
rating = item.find('div', class_="img-wrap").text
rating = (re.sub("[^0-9,.]", "", rating))
except:
rating = None
pricce = item.find('span', class_="double-line-ellipsis").text.strip() # price for biriyani
price = re.sub("[^0-9]", "", pricce)[:-1]
biry_del = {
'name': title,
'location': loc,
'rating': rating,
'price': price
}
rest_list.append(biry_del)
rest_list = []
for i in range(1,18):
print(f'getting page, {i}')
c = extract(i)
transform(c)
I hope you guys understood please ask in comment for any confusion.
It's not very fast but it looks like you can get all the details you want including the review count (not 232!) if you hit this backend api endpoint:
https://www.dineout.co.in/get_rdp_data_main/delhi/69676/restaurant_detail_main
import requests
from bs4 import BeautifulSoup
import pandas as pd
rest_list = []
for page in range(1,3):
print(f'getting page, {page}')
s = requests.Session()
url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}" # URL of the website
header = {'User-Agent':'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'} # Temporary user agent
r = s.get(url, headers=header)
soup = BeautifulSoup(r.content, 'html.parser')
divs = soup.find_all('div', class_ = 'restnt-card restaurant')
for item in divs:
code = item.find('a')['href'].split('-')[-1] # restaurant code
print(f'Getting details for {code}')
data = s.get(f'https://www.dineout.co.in/get_rdp_data_main/delhi/{code}/restaurant_detail_main').json()
info = data['header']
info.pop('share') #clean up csv
info.pop('options')
rest_list.append(info)
df = pd.DataFrame(rest_list)
df.to_csv('dehli_rest.csv',index=False)
I am trying to scrape this website and trying to get the reviews but I am facing an issue,
The page loads only 50 reviews.
To load more you have to click "Show More Reviews" and I don't know how to get all the data as there is no page link, also "Show more Reviews" doesn't have a URL to explore, the address remains the same.
url =
"https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
a = []
url = requests.get(url)
html = url.text
soup = BeautifulSoup(html, "html.parser")
table = soup.findAll("div", {"class":"review-comments"})
#print(table)
for x in table:
a.append(x.text)
df = pd.DataFrame(a)
df.to_csv("review.csv", sep='\t')
I know this is not pretty code but I am just trying to get the review text first.
kindly help. As I am little new to this.
Looking at the website, the "Show more reviews" button makes an ajax call and returns the additional info, all you have to do is find it's link and send a get request to it (which I've done with some simple regex):
import requests
import re
from bs4 import BeautifulSoup
headers = {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/74.0.3729.169 Chrome/74.0.3729.169 Safari/537.36"
}
url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"
Data = []
#Each page equivalant to 50 comments:
MaximumCommentPages = 3
with requests.Session() as session:
info = session.get(url)
#Get product ID, needed for getting more comments
productID = re.search(r'"product_id":(\w*)', info.text).group(1)
#Extract info from main data
soup = BeautifulSoup(info.content, "html.parser")
table = soup.findAll("div", {"class":"review-comments"})
for x in table:
Data.append(x)
#Number of pages to get:
#Get additional data:
params = {
"page": "",
"product_id": productID
}
while(MaximumCommentPages > 1): # number 1 because one of them was the main page data which we already extracted!
MaximumCommentPages -= 1
params["page"] = str(MaximumCommentPages)
additionalInfo = session.get("https://www.capterra.com/gdm_reviews", params=params)
print(additionalInfo.url)
#print(additionalInfo.text)
#Extract info for additional info:
soup = BeautifulSoup(additionalInfo.content, "html.parser")
table = soup.findAll("div", {"class":"review-comments"})
for x in table:
Data.append(x)
#Extract data the old fashioned way:
counter = 1
with open('review.csv', 'w') as f:
for one in Data:
f.write(str(counter))
f.write(one.text)
f.write('\n')
counter += 1
Notice how I'm using a session to preserve cookies for the ajax call.
Edit 1: You can reload the webpage multiple times and call the ajax again to get even more data.
Edit 2: Save data using your own method.
Edit 3: Changed some stuff, now gets any number of pages for you, saves to file with good' ol open()
There are quite similar scenarios regarding this; but I've been comparing with others.
Getting from Clustered Nodes etc. But somehow; I'm unsure why my for loop isn't iterating and grabbing the text from other elements but only from the first element of the node.
from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
for container in html_soup.find_all('div', {'class': 'section-trending-search-list'}):
topic = container.select_one(
'div._1waRmo')
if topic:
print(1)
d = {
'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
except:
d = None
findDiv()
print(l)
from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
for container in html_soup.find_all('div', {'class': '_25qBG5'}):
topic = container.select_one('div._1waRmo')
if topic:
d = {'Titles': topic.text.replace("\n", "")}
l.append(d)
return d
except:
d = None
findDiv()
print(l)
Output:
[{'Titles': 'school backpack'}, {'Titles': 'oppo case'}, {'Titles': 'baby chair'}, {'Titles': 'car holder'}, {'Titles': 'sling beg'}]
Again I suggest you use selenium. If you run this again you will see that you will get a different set of 5 dictionaries within the list. Every time you are making a request they are giving 5 random trending items. But they do have a 'change' button. If you use selenium, you might be able to just click that and keep scraping all trending items.
Try this:
toplevel is finding the root of the options, then we find all divs under that.
I hope this is what you want.
from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
toplevel = html_soup.find('._25qBG5')
for container in toplevel.find_all('div'):
topic = container.select_one('._1waRmo')
if topic:
print(1)
d = {'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
except:
d = None
findDiv()
print(l)
This enumerates fine with a local file. When I tried with the url given, the website wasn't returning the html you show.
from requests import get
from bs4 import BeautifulSoup
url = 'path_in_here\\test.html'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
example = open(url,"r")
text = example.read()
#response = get(url, headers=headers)
#html_soup = BeautifulSoup(response.text, 'html.parser')
html_soup = BeautifulSoup(text, 'html.parser')
print (text)
def findDiv():
#try:
print("finding toplevel")
toplevel = html_soup.find("div", { "class": "_25qBG5"} )
print ("found toplevel")
divs = toplevel.findChildren("div", recursive=True)
print("found divs")
for container in divs:
print ("loop")
topic = container.select_one('.1waRmo')
if topic:
print(1)
d = {'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
#except:
# d = None
# print ("error")
findDiv()
print(l)
I tried the following code to download all pdf file from the links but with that It download all files when I run these code every time. Recommended: First time it should download all pdf, and from next time it should download only which one is new.(it should check first which one is new)
My Code:
import requests
from bs4 import BeautifulSoup
root_url = 'https://www.iea.org'
def getLinks(url):
all_links = []
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href'))
return all_links
yearLinks = getLinks(root_url +'/oilmarketreport/reports/')
# get report URL
reportLinks = []
for url in yearLinks:
links = getLinks(url)
#reportLinks.extend(links)
#print(reportLinks)
i =0
for url_ in links:
if "AnnualStatisticalSupplement" not in url_:
url__ = url_.replace("org..", "org").replace("../", "")
response = requests.get(url__, stream=True)
lastindex= url__.rfind('/')
strlen = len(url__)
filename = url__[lastindex:strlen]
with open('/home/pdfs/'+ str(filename), 'wb') as pdffile:
pdffile.write(response.content)
i += 1
print(url__)
print("Download Completed")
Then I need to store that file is Mongo DB, How should i do that by making three column(pdf name, reported date, flag of process).
Sorry for the significant change in your code. because your code is too messy to read.
if you want to download the pdf you don't have since some time, you must add if-loop to control your action. by the way if you add page url into your database that you need not to access one more time to get the pdf name.
import requests
from bs4 import BeautifulSoup
root_url = 'https://www.iea.org'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
downloaded = ["2018-02-13.pdf"] # the latest i have
def getLinks(url):
page = requests.get(url,headers=headers)
soup = BeautifulSoup(page.text, 'lxml')
li = soup.find_all("li",class_="omrlist")
links = [root_url + href.a.get('href') for href in li]
return links
def get_pdf(url,flag=1):
# find page link in the month directory
pdf_page = requests.get(url,headers=headers)
soup = BeautifulSoup(pdf_page.text, 'lxml')
li = soup.find_all("li",class_="omrlist")[::-1] # latest -> old
latest_pdf_set = [root_url + href.a.get('href') for href in li]
# find pdf link
pdf_links = []
for pdf_url in latest_pdf_set:
text = requests.get(pdf_url,headers=headers).text
soup = BeautifulSoup(text,"lxml")
link = soup.find("div",class_="omrreport pL10").find("a").get("href")
if link.split("/")[-1] in downloaded:
flag = 0 # if flag = 0 means you found the pdf that you already had
break
pdf_links.append(root_url + link)
return pdf_links,flag
yearLinks = getLinks(root_url +'/oilmarketreport/reports/')
all_ = []
for each in yearLinks:
pdf_links = get_pdf(each)
all_ += pdf_links[0]
if not pdf_links[1]:
# flag = 0 break
break
print(all_)
I'm new to python and currently writing an application that scrapes data off the web. It's mostly done, there is only a little problem left with encoding. The site is encoded in ISO-8859-1, but when I try to html.decode('iso-8859-1'), it doesn't do anything.
If you run the program, use 50000 and 50126 for PLZs and you'll see what I mean in the output. It would be awesome if someone could help me out.
import urllib.request
import time
import csv
import operator
from bs4 import BeautifulSoup
#Performs a HTTP-'POST' request, passes it to BeautifulSoup and returns the result
def doRequest(request):
requestResult = urllib.request.urlopen(request)
soup = BeautifulSoup(requestResult)
return soup
#Returns all the result links from the given search parameters
def getLinksFromSearch(plz_von, plz_bis):
database = []
links = []
#The search parameters
params = {
'name_ff': '',
'strasse_ff': '',
'plz_ff': plz_von,
'plz_ff2': plz_bis,
'ort_ff': '',
'bundesland_ff': '',
'land_ff': 'DE',
'traeger_ff': '',
'Dachverband_ff': '',
'submit2' : 'Suchen'
}
DATA = urllib.parse.urlencode(params)
DATA = DATA.encode('utf-8')
request = urllib.request.Request(
"http://www.altenheim-adressen.de/schnellsuche/suche1.cfm",
DATA)
# adding charset parameter to the Content-Type header.
request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
#The search request
html = doRequest(request)
h = html.decode('iso-8859-1')
soup = BeautifulSoup(h)
for link in soup.find_all('a'):
database.append(link.get('href'))
#Remove the first Element ('None') to avoid Attribute Errors
database.pop(0)
for item in database:
if item.startswith("suche"):
links.append(item)
return links
#Performs a search on the link results
def searchOnLinks(links):
adresses = []
i = 1
j = len(links)
print("Found", j, "results, collecting data.")
for item in links:
adresses.append(getContactInfoFromPage(item, i, j))
i = i + 1
time.sleep(0.1)
print("All done.")
return adresses
#A method to scrape the contact info from the search result
def getContactInfoFromPage(page, i, j):
name = ''
straße = ''
plz = ''
stadt = ''
telefon = ''
mail = ''
url = ''
data = [
#'Name',
#'Straße',
#'PLZ',
#'Stadt',
#'Telefon',
#'E-Mail',
#'Homepage'
]
request = urllib.request.Request("http://www.altenheim-adressen.de/schnellsuche/" + page)
#request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
request.add_header("Content-Type", "text/html;charset=UTF-8")
request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
print("(" , i , "/" , j , ") Making request...")
soup = doRequest(request)
print("Done.")
findeName = soup.findAll('b')
name = findeName[2]
name = name.string.split('>')
data.append(name[0])
straße = getFieldValue(soup, "Straße")
data.append(straße)
ort = getFieldValue(soup, "Ort")
(plz, stadt) = ort.split(' ', 1)
data.append(plz)
data.append(stadt)
telefon = getFieldValue(soup, "Telefon")
data.append(telefon)
mail = getFieldValue(soup, "EMail")
data.append(mail)
url = getFieldValue(soup, "Internetadresse")
data.append(url)
return data
#Strips the text from the given field's sibling
def getFieldValue(soup, field):
field_label = soup.find('td', text=field + ':')
return field_label.find_next_sibling('td').get_text(strip=True)
#The main input/output function
def inputOutput():
#PLZ is German for zip-code and consists of a five-digit number
#The program passes the numbers to the servers, and the server
#returns all search results between the two numbers
plz_von = input("Please enter first PLZ: ")
plz_bis = input("Please enter second PLZ: ")
links = getLinksFromSearch(plz_von, plz_bis)
#Checks if the search yielded any results
if len(links) > 0:
data = searchOnLinks(links)
file_name = input("Save as: ")
print("Writing to file...")
with open(file_name + '.csv', 'w', newline='') as fp:
a = csv.writer(fp, delimiter=',')
a.writerows(data)
else:
print("The search yielded no results.")
inputOutput()
Your doRequest() function returns a BeautifulSoup object, you cannot decode that object. Just use it directly:
soup = doRequest(request)
You don't need to decode the response at all; BeautifulSoup uses both hints in the HTML (<meta> headers) as well as statistical analysis to determine the correct input encoding.
In this case the HTML document claims it is Latin-1:
<meta name="content-type" content="text/html; charset=iso-8859-1">
The response doesn't include a character set in the Content-Type header either, so this is a case of a misconfigured server. You can force BeautifulSoup to ignore the <meta> header with:
soup = BeautifulSoup(requestResult, from_encoding='utf8')