Python scraping data of multiple pages issue - python

I'm running into an issue: my code scrapes everything from only the first page, but I want to scrape data from multiple pages the same way as from the first page. Actually, I also wrote code for multiple pages — it moves forward to the next page, but it scrapes the first page's data again. Please have a look at my code and guide me on how to fix this issue. Thanks!
here is my code:
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url):
    """Fetch *url* and return a parsed BeautifulSoup tree, or None on a bad status."""
    response = requests.get(url)
    if response.ok:
        # 1. html payload, 2. parser choice
        return BeautifulSoup(response.text, 'html.parser')
    print('server responded:', response.status_code)
def get_detail_page(soup):
    """Scrape one item's metadata from its detail-page soup.

    Any field whose element is missing falls back to an "Empty ..."
    placeholder, so the returned dict always contains the full set of keys.

    :param soup: parsed detail page, or None when the fetch failed
    :return: dict of stripped metadata strings
    """
    def text_of(default, *find_args, **find_kwargs):
        # soup.find() returns None for a missing element and .text on None
        # raises AttributeError; the same happens when soup itself is None
        # (get_page returns None on a bad HTTP status). Catching only
        # AttributeError replaces the original twelve bare excepts.
        try:
            return soup.find(*find_args, **find_kwargs).text.strip()
        except AttributeError:
            return default

    # Collection is nested one level deeper (an <a> inside the <td>).
    try:
        collection = soup.find('td', id="metadata_collec").find('a').text.strip()
    except AttributeError:
        collection = "Empty Collection"

    return {
        'Title': text_of('Empty Title', 'h1', class_="cdm_style", id=False),
        'Collection': collection,
        'Author': text_of("Empty Author", 'td', id="metadata_creato"),
        'Abstract': text_of("Empty Abstract", 'td', id="metadata_descri"),
        'Keywords': text_of("Empty Keywords", 'td', id="metadata_keywor"),
        'Publishers': text_of("Empty Publishers", 'td', id="metadata_publis"),
        'Date_original': text_of("Empty Date original", 'td', id="metadata_contri"),
        'Date_digital': text_of("Empty Date digital", 'td', id="metadata_date"),
        'Format': text_of("Empty Format", 'td', id="metadata_source"),
        'Release-st': text_of("Empty Realease Statement", 'td', id="metadata_rights"),
        'Library': text_of("Empty Library", 'td', id="metadata_librar"),
        'Date_created': text_of("Empty date Created", 'td', id="metadata_dmcreated"),
    }
def get_index_data(soup):
    """Collect absolute detail-page URLs from a search-results soup.

    Only anchors carrying an item_id attribute point at real items; the
    rest (navigation links etc.) are skipped. Returns [] when the page
    could not be parsed — the original returned an undefined variable
    (NameError) on that path.
    """
    try:
        titles_link = soup.find_all('a', class_="body_link_11")
    except AttributeError:
        # soup is None when the fetch failed (see get_page).
        return []
    urls = []
    for link in titles_link:
        # All titles with valid links carry an item_id attribute.
        item_id = link.attrs.get('item_id')
        href = link.attrs.get('href')
        # Also require href: the original formatted a missing href into
        # "http://cgsc.cdmhost.comNone".
        if item_id and href:
            urls.append("http://cgsc.cdmhost.com{}".format(href))
    return urls
def write_csv(data, url):
    """Append one scraped record (plus its source URL) to the output CSV.

    newline='' stops the csv module from emitting blank rows on Windows,
    and an explicit encoding keeps non-ASCII metadata from crashing.
    """
    with open('1111_to_5555.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            data['Title'], data['Collection'], data['Author'],
            data['Abstract'], data['Keywords'], data['Publishers'],
            data['Date_original'], data['Date_digital'], data['Format'],
            data['Release-st'], data['Library'], data['Date_created'],
            url,
        ])
def main():
    """Walk search-result pages 2-3 and scrape every item listed on each."""
    # The page number is appended to this base URL; hoisted out of the loop
    # since it never changes between iterations.
    base_url = ("http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/"
                "searchterm/1/field/all/mode/all/conn/and/order/nosort/page/")
    for page in range(2, 4):
        print(page)
        listing_soup = get_page(f"{base_url}{page}")
        for product_url in get_index_data(listing_soup):
            record = get_detail_page(get_page(product_url))
            write_csv(record, product_url)


if __name__ == '__main__':
    main()

In the get_page() function, try adding request headers (note that requests expects headers as a dict mapping header names to values):
def get_page(url):
    # requests expects `headers` to be a mapping of header name -> value;
    # the original passed the bare user-agent string, which is not a valid
    # headers argument.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.36',
    }
    response = requests.get(url, headers=headers)

Related

When I use Beautiful Soup library find_all it finds only half of what I ask

# NOTE(review): this fragment referenced self.soup outside any class (NameError)
# and an undefined `headers` variable; both are fixed below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}
URL_ZILLOW = "https://www.zillow.com/manhattan-new-york-ny/rentals/1-1_beds/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Manhattan%2C%20New%20York%2C%20NY%22%2C%22mapBounds%22%3A%7B%22west%22%3A-74.08612384255531%2C%22east%22%3A-73.66315020974281%2C%22south%22%3A40.65707407862896%2C%22north%22%3A40.911828962172066%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A12530%2C%22regionType%22%3A17%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A472874%7D%2C%22mp%22%3A%7B%22max%22%3A1700%7D%2C%22beds%22%3A%7B%22min%22%3A1%2C%22max%22%3A1%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22fr%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D "
response = requests.get(URL_ZILLOW, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
prices = [price.getText() for price in soup.find_all("div", "list-card-price")]
addresses = [address.getText() for address in soup.find_all(name="address", class_="list-card-addr")]
links = [link['href'] for link in soup.find_all(name="a", class_="list-card-link")]
You are needlessly scraping HTML when there is a backend api that can be hit, open your browsers Developer Tools - Network - fetch/XHR and then refresh the page. You'll see a network request to their backend api, you can copy it into a new tab in your browser and see the json data, very easy to scrape:
import requests
import pandas as pd
paste_network_tab_page_2_url = 'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A2%7D%2C%22usersSearchTerm%22%3A%22Philadelphia%2C%20PA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-75.62199789164843%2C%22east%22%3A-74.70875936625781%2C%22south%22%3A39.528020026936595%2C%22north%22%3A40.41625050351642%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sortSelection%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22isAllHomes%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%7D&wants={%22cat1%22:[%22listResults%22,%22mapResults%22],%22cat2%22:[%22total%22]}&requestId=11'
headers = {
    'accept': 'application/json',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}
final = []
for page in range(1, 5):  # edit for page numbers, you might get banned if you go wild
    print(f'Scraping page: {page}')
    # Substitute the page number inside the urlencoded searchQueryState JSON:
    # '%22currentPage%22%3A2' decodes to '"currentPage":2'. The original
    # replace('currentPage%22%', f'currentPage%2{page}%') corrupted the
    # percent-encoding for every page other than 2 (e.g. '%23' is '#').
    url = paste_network_tab_page_2_url.replace(
        '%22currentPage%22%3A2', f'%22currentPage%22%3A{page}')
    resp = requests.get(url, headers=headers).json()
    # Per-response flag; hoisted out of the per-listing loop.
    detected = resp['user']['isBot']
    for result in resp['cat1']['searchResults']['listResults']:
        _id = result['id']
        url = result['detailUrl']
        img = result['imgSrc']
        _type = result['statusType']
        price = result['unformattedPrice']
        address = result['address']
        # Optional fields: dict.get with a default replaces the original
        # try/except KeyError boilerplate.
        _zip = result.get('addressZipcode', '')
        beds = result.get('beds', '')
        baths = result.get('baths', '')
        lat_lng = result.get('latLong', {})
        lat = lat_lng.get('latitude', '')
        lng = lat_lng.get('longitude', '')
        age = result.get('variableData', {}).get('text', '')
        area = result.get('area', '')
        # lots more data than this in the file, go have a look at the
        # contents of print(resp['cat1']['searchResults']['listResults'])
        item = {
            'detected?lol': detected,
            'id': _id,
            'url': url,
            'img': img,
            'type': _type,
            'price': price,
            'address': address,
            'zip': _zip,
            'beds': beds,
            'baths': baths,
            'lat': lat,
            'lng': lng,
            'age': age,
            'area': area
        }
        final.append(item)
df = pd.DataFrame(final)
df.to_csv('zillow_results.csv', index=False)
print('results saved in file: zillow_results.csv')

Yellow Pages Scraper in Python Not working

I am trying to scrape data from Yellow Pages. I used this scraper many times, but it has recently stopped.
Got this error
'NoneType' object has no attribute 'group' 0 results found
can anyone please help me to fix this problem
Where am I going wrong on this?
import requests
import requests_random_user_agent
import urllib.parse
from bs4 import BeautifulSoup
import re
from math import ceil
import csv
import os
import sys
import subprocess
from os import system, name
import time
from tqdm import tqdm
class Scraper:
    """Scrapes Yellow Pages search results for a keyword and location."""

    def __init__(self, keyword, location):
        self.keyword = keyword
        self.location = location
        # Query string shared by every search-page request.
        self.params = urllib.parse.urlencode(
            {"search_terms": self.keyword, "geo_location_terms": self.location})

    def get_info(self, link):
        """Fetch one business page and pull out its contact details.

        Missing fields come back as None; returns False when the page
        itself cannot be fetched.
        """
        try:
            r = requests.get(link)
            html = BeautifulSoup(r.content, "html.parser")
        except:
            return False
        try:
            name = html.find('h1').text
        except:
            name = None
        try:
            phone = html.find(class_='phone').text
        except:
            phone = None
        try:
            website = html.find('a', class_='primary-btn website-link')["href"]
            # Drop tracking query parameters from the outbound link.
            if len(website.split("?")) > 1:
                website = website.split("?")[0]
        except:
            website = None
        try:
            email = html.find('a', class_='email-business')["href"].split(":")[1]
        except:
            email = None
        try:
            address = html.find('h2', class_='address').text
        except:
            address = None
        return {"name": name, "email": email, "phone": phone,
                "address": address, "website": website}

    def get_num_pages(self):
        """Return (total results, number of result pages), or (False, False).

        When results fit on a single page there is no pagination bar, so
        real listing links (the ones carrying data-analytics) are counted
        directly instead.
        """
        try:
            url = f"https://www.yellowpages.com/search?{self.params}"
            response = requests.get(url)
            html = BeautifulSoup(response.content, "html.parser")
            pagination = html.find(class_="pagination")
            if not pagination:
                links = html.select("a[class='business-name']")
                num_results = 0
                for l in links:
                    try:
                        l["data-analytics"]
                        num_results += 1
                    except:
                        continue
                return num_results, 1
            # NOTE(review): the "We found X results" text is reported (in
            # the question below) to be absent from the current page, so
            # re.search used to return None and .group raised
            # "'NoneType' object has no attribute 'group'". Guard it.
            match = re.search('We found(.*?)results', pagination.text)
            if not match:
                return False, False
            num_results = int(match.group(1))
            # 30 listings per page.
            return num_results, int(ceil(int(num_results) / 30))
        except Exception as e:
            print(e)
            return False, False

    def get_links(self, page):
        """Return absolute listing URLs from one search-results page."""
        try:
            url = f"https://www.yellowpages.com/search?{self.params}&page={page}"
            response = requests.request("GET", url, timeout=10)
            html = BeautifulSoup(response.content, "html.parser")
            links = html.select("a[class='business-name']")
            links_list = []
            for l in links:
                try:
                    # Only real listings carry the data-analytics attribute.
                    l["data-analytics"]
                except:
                    continue
                links_list.append(f"https://www.yellowpages.com{l['href']}")
            return links_list
        except Exception as e:
            print(e)
            return []
def open_file(filename):
    """Open *filename* with the platform's default application."""
    try:
        if sys.platform == "win32":
            os.startfile(filename)
            return
        # macOS ships `open`; assume a freedesktop environment otherwise.
        launcher = "open" if sys.platform == "darwin" else "xdg-open"
        subprocess.call([launcher, filename])
    except:
        return False
def create_csv(elements):
    """Dump the scraped business records to output.csv, header row first."""
    rows = [["Name", "Address", "Phone", "Email", "Website"]]
    rows.extend(
        [e["name"], e["address"], e["phone"], e["email"], e["website"]]
        for e in elements
    )
    with open('output.csv', 'w', newline='', encoding='utf8') as file:
        csv.writer(file).writerows(rows)
def clear():
    """Clear the terminal: 'cls' on Windows (os.name == 'nt'), 'clear' elsewhere."""
    _ = system('cls' if name == 'nt' else 'clear')
def main():
    """Prompt for a search, scrape every result page, and save/open the CSV."""
    clear()
    try:
        keyword = ""
        while not keyword:
            keyword = input("Keyword: ")
        city = ""
        while not city:
            city = input("City: ")
        clear()
        scraper = Scraper(keyword, city)
        results, num_pages = scraper.get_num_pages()
        if not results:
            print("0 results found")
            return False
        print(f"{results} results found {keyword} - {city}")
        data = []
        pages = tqdm(range(1, num_pages + 1))
        for page in pages:
            clear()
            try:
                pages.set_description(f"Scraping page {page}/{num_pages}...")
                links = scraper.get_links(page)
                if not links:
                    continue
                links = tqdm(links)
                for link in links:
                    try:
                        links.set_description(f"Scraping {link}")
                        info = scraper.get_info(link)
                        data.append(info)
                        # Rewrite the CSV after every record so a crash
                        # loses nothing.
                        create_csv(data)
                    except:
                        continue
            except:
                continue
        print("Opening file...")
        open_file("output.csv")
        print("Task finished")
    except:
        return False


if __name__ == "__main__":
    main()
It fails on the line
num_results = int(re.search('We found(.*?)results',pagination.text).group(1))
A very simple check of the search results page, by opening the browser, would have shown you that the text "We found x results" is not present on the page. So re.search returns None, even if there are many results.
Adjust your script to work without num_pages and only paginate via the page links at the bottom or by incrementing the page= parameter in the URL until no more results/pages are listed.
FYI, next time, put in some minimal debugging effort and not post your entire script.

Python Error with scraping Forum for Title and URL

I want to scrape the title and the URL of each Posting at the Forum of the URL, so that when a new Post is created with 1 of the Titles below i'd like to receive a Mail with that Link of the Post.
Please do not be so harsh with me i'm a beginner with Python and Scraping
I have multiple Problems.
1: at the While(True) Function the "soup" is red underlined with the Error: Undefined variable 'soup'
2: When commenting out the While(True) Function then the Program will not run. I get no error.
3: When there is a new Posting with one of my Criterias, how do I get the URL of that Post?
Titles
def Jeti_DC_16
def Jeti_DC_16_v2
def Jeti_DS_16
def Jeti_DS16_v2
My FullCode
from requests import get
from bs4 import BeautifulSoup
import re
import smtplib
import time
import lxml
import pprint
import json
URL = 'https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'


def scrape_page_metadata(URL):
    """Fetch the forum listing page and collect a title for each watched model."""
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    response = get(URL, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    # One entry per transmitter model being watched.
    metadata = {
        'Jeti_DC_16': Jeti_DC_16(soup, URL),
        'jeti_dc_16_2': Jeti_DC_16_v2(soup, URL),
        'jeti_ds_16': Jeti_DS_16(soup, URL),
        'jeti_ds_16_2': Jeti_DS_16_v2(soup, URL),
    }
    pprint.PrettyPrinter(indent=4).pprint(metadata)
    return metadata
def Jeti_DC_16(soup, URL):
    """Try to pull a 'Jeti DC 16' title from the page, else derive one from the URL.

    NOTE(review): soup.name.string reads the root tag's name, which is
    presumably not the page title that was intended — confirm whether
    soup.title was meant.
    NOTE(review): find_all() returns a ResultSet (list-like) which has no
    .get(), so the elif branch raises AttributeError whenever it runs;
    iterate the results instead.
    """
    jeti_dc_16 = None
    if soup.name.string:
        jeti_dc_16 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_dc_16 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        # Fallback: capitalised second dotted label of the URL's hostname.
        jeti_dc_16 = URL.split('//')[1]
        return jeti_dc_16.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_dc_16
def Jeti_DC_16_v2(soup, URL):
    """Same lookup as Jeti_DC_16, nominally for the 'DC 16 v2' model.

    NOTE(review): the body is identical to the other Jeti_* functions and
    does not actually search for a different model — presumably each was
    meant to match a different post title; confirm intent.
    NOTE(review): find_all() returns a ResultSet with no .get(), so the
    elif branch raises AttributeError when it runs.
    """
    jeti_dc_16_v2 = None
    if soup.name.string:
        jeti_dc_16_v2 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_dc_16_v2 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        # Fallback: capitalised second dotted label of the URL's hostname.
        jeti_dc_16_v2 = URL.split('//')[1]
        return jeti_dc_16_v2.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_dc_16_v2
def Jeti_DS_16(soup, URL):
    """Same lookup pattern, nominally for the 'DS 16' model.

    NOTE(review): soup.jeti_ds_16 is BeautifulSoup attribute access — it
    looks up a <jeti_ds_16> tag, which will not exist in the page, so
    .string on the resulting None raises AttributeError; presumably
    soup.title (as in the sibling functions) was meant — confirm.
    NOTE(review): find_all() returns a ResultSet with no .get(), so the
    elif branch raises AttributeError when it runs.
    """
    jeti_ds_16 = None
    if soup.jeti_ds_16.string:
        jeti_ds_16 = soup.jeti_ds_16.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_ds_16 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        # Fallback: capitalised second dotted label of the URL's hostname.
        jeti_ds_16 = URL.split('//')[1]
        return jeti_ds_16.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_ds_16
def Jeti_DS_16_v2(soup, URL):
    """Return a title for the 'DS 16 v2' search, falling back to the URL's host.

    Fixes the original NameError: the else branch assigned and returned
    jeti_dc_16_v2 (the DC variable name) instead of jeti_ds_16_v2.
    """
    jeti_ds_16_v2 = None
    if soup.name.string:
        jeti_ds_16_v2 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        # NOTE(review): find_all() returns a ResultSet which has no .get();
        # this branch raises AttributeError if it ever runs — iterate the
        # results instead.
        jeti_ds_16_v2 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        # Fallback: capitalised second dotted label of the URL's hostname.
        jeti_ds_16_v2 = URL.split('//')[1]
        return jeti_ds_16_v2.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_ds_16_v2
# (commented-out experiments kept from the original)
# search_for_class = soup.find_all(
#     'div', class_='structItem-title')
# Jeti_DS_16 = soup.find_all(text="Jeti DS 16")
# Jeti_DS_16_v2 = soup.find_all(text="Jeti DS 16 2")
# Jeti_DC_16 = soup.find_all(text="Jeti DC 16")
# Jeti_DC_16_v2 = soup.find_all(text="Jeti DC 16 2")
# NOTE(review): the condition below is a 4-element tuple of function objects,
# which is always truthy, so send_mail() fires unconditionally — compare the
# metadata values returned by scrape_page_metadata() instead.
if(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2):
    send_mail()
# # print('Die Nummer {0} {1} {2} {3} wurden gezogen'.format(
# #     Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2))
# for i in soup.find_all('div', attrs={'class': 'structItem-title'}):
#     print(i.a['href'])
# first_result = search_for_class[2]
# print(first_result.text)
# print(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2)
def send_mail():
    """E-mail a notification that a matching post appeared on RC-Network."""
    # Gmail credentials live outside the source tree.
    with open('/Users/blackbox/Desktop/SynologyDrive/Programmieren/rc-network/credentials.json', 'r') as myFile:
        obj = json.loads(myFile.read())
    print("test: " + str(obj['passwd']))
    server_ssl = smtplib.SMTP_SSL('smtp.gmail.com', 465)
    server_ssl.ehlo()
    server_ssl.login('secure#gmail.com', 'secure')
    subject = 'Es gibt ein neuer Post im RC-Network auf deine gespeicherte Anfragen. Sieh in dir an{Link to Post}'
    body = 'Sieh es dir an Link: https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'
    server_ssl.sendmail(
        'secure#gmail.com',
        ["secure#gmx.de"],
        f"Subject: {subject}\n\n{body}",
    )
    print('e-Mail wurde versendet!')
# NOTE(review): `soup` is a local variable inside scrape_page_metadata and is
# undefined at this point — each call below raises NameError. Call
# scrape_page_metadata(URL) once per iteration and inspect the returned
# metadata dict instead.
while(True):
    Jeti_DC_16(soup, URL)
    Jeti_DC_16_v2(soup, URL)
    Jeti_DS_16(soup, URL)
    Jeti_DS_16_v2(soup, URL)
    # Poll every 10 seconds (daily polling kept below for reference).
    time.sleep(10)
    # time.sleep(86400)
You create soup inside scrape_page_metadata, so it is a local variable which doesn't exist outside scrape_page_metadata. In the while-loop you should call scrape_page_metadata() instead of the functions Jeti_DC_16(), Jeti_DC_16_v2(), Jeti_DS_16(), Jeti_DS_16_v2().
And this functions gives you metadata which you should check instead of if(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2)
More or less (you have to use correct value in place of ... because I don't know what you want to compare)
while True:
metadata = scrape_page_metadata(URL)
if metadata["Jeti_DC_16"] == ... and metadata["Jeti_DC_16_v2"] == ... and metadata["Jeti_DS_16"] == ... and metadata["Jeti_DS_16_v2"] == ...:
send_mail()
time.sleep(10)
But there are other problems
All your functions Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2 look the same and probably they return the same element. You could use one of them and delete others. Or you should change them and they should search different elements.
You will probably have to use more print() calls to see the values in variables and which parts of the code are executed, because I think this code still needs a lot of changes.
For example find_all() gives list with results and you can't use get() which needs single element. You need for-loop to get all titles from all elements
More or less
jeti_ds_16_v2 = soup.find_all("div", class_='structItem-itle')
jeti_ds_16_v2 = [item.get('text') for item in jeti_ds_16_v2]

Newbie: Python "AttributeError: 'NoneType' object has no attribute 'text' " when scraping Tripadvisor Reviews

I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
It worked (though, removing the attribute "language") for one link but it doesn't work for any more link (for example.)
I'm receiving the error:
> Traceback (most recent call last):
> File "<pyshell#27>", line 4, in <module>
> items = scrape(url)
> File "<pyshell#12>", line 11, in scrape
> items = parse(session, url + '?filterLang=' + lang)
> File "<pyshell#15>", line 12, in parse
> num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
> AttributeError: 'NoneType' object has no attribute 'text'
I'm attaching the code here with the changes I made in case someone can help me.
Thank you so much!
Silvia
--
I substituted the original:
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
with
num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
With the original code I get the error
ValueError: invalid literal for int() with base 10: '5.695'
(where 5.695 is the number of reviews in the page)
--
Hereby the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io
def display(content, filename='output.html'):
    """Save raw *content* bytes to *filename* and open it in the default browser."""
    with open(filename, 'wb') as handle:
        handle.write(content)
    webbrowser.open(filename)
def get_soup(session, url, show=False):
    """GET *url* via *session*; return a soup, or None on a non-200 status."""
    r = session.get(url)
    if show:
        # Dump the raw response for manual inspection.
        display(r.content, 'temp.html')
    if r.status_code == 200:
        return BeautifulSoup(r.text, 'html.parser')
    print('[get_soup] status code:', r.status_code)
def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''
    r = session.post(url, data=params)
    if show:
        # Dump the raw response for manual inspection.
        display(r.content, 'temp.html')
    if r.status_code == 200:
        return BeautifulSoup(r.text, 'html.parser')
    print('[post_soup] status code:', r.status_code)
def scrape(url, lang='ALL'):
    """Scrape all reviews for *url*, keeping cookies across requests."""
    # One session so cookies (etc.) persist between requests.
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    return parse(session, url + '?filterLang=' + lang)
def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    # The tab-count <span> class is obfuscated and changes when Tripadvisor
    # redeploys; guard against it disappearing instead of crashing with
    # "'NoneType' object has no attribute 'text'" (the reported error).
    count_span = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH')
    if not count_span:
        print('[parse] review count not found:', url)
        return
    num_reviews = count_span.text
    num_reviews = num_reviews[1:-1]  # strip the surrounding parentheses
    # The count may use ',' or '.' as a thousands separator (e.g. "5.695"
    # caused the reported ValueError), so drop both before converting.
    num_reviews = num_reviews.replace(',', '').replace('.', '')
    num_reviews = int(num_reviews)  # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)
    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)
    items = []
    offset = 0
    # Reviews are paged 5 at a time via the -or<offset> URL suffix.
    while True:
        subpage_url = url_template.format(offset)
        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        if len(subpage_items) < 5:
            break
        offset += 5
    return items
def get_reviews_ids(soup):
    """Return the de-duplicated review ids on the page, or None if there are none."""
    tagged = soup.find_all('div', attrs={'data-reviewid': True})
    if tagged:
        # Each review appears twice in the markup, so keep every other id.
        ids = [tag.attrs['data-reviewid'] for tag in tagged][::2]
        print('[get_reviews_ids] data-reviewid:', ids)
        return ids
def get_more(session, reviews_ids):
    """POST the overlay-widget endpoint to fetch expanded bodies for *reviews_ids*."""
    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
    payload = {
        'reviews': ','.join(reviews_ids),  # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR', # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX',  # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }
    return post_soup(session, url, payload)
def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    # NOTE(review): the question's follow-up says id='HEADING' was no longer
    # present and class_='heading' worked instead — confirm against the
    # current page markup before relying on this line.
    hotel_name = soup.find('h1', id='HEADING').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    # Fetch the expanded review bodies via the overlay-widget endpoint.
    soup = get_more(session, reviews_ids)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        # First badge is the contribution count, second the helpful votes.
        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'
        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''
        # Rating is encoded in the second CSS class (e.g. bubble_40 -> "40").
        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]
        # NOTE(review): hotel_name, contributions, helpful_vote, user_loc and
        # bubble_rating are computed above but never stored — the item keeps
        # only the body and date.
        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
        }
        items.append(item)
        print('\n--- review ---\n')
        for key,val in item.items():
            print(' ', key, ':', val)
        print()
    return items
def write_in_csv(items, filename='results.csv',
                 headers=['hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name' , 'user location', 'rating'],
                 mode='w'):
    """Write the review dicts to *filename*; the header row is emitted only when mode='w'."""
    print('--- CSV ---')
    with io.open(filename, mode, encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, headers)
        if mode == 'w':
            writer.writeheader()
        writer.writerows(items)
# Only these two review fields are persisted.
DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'
start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]
headers = [
    DB_COLUMN,
    DB_COLUMN1,
]
lang = 'it'
for url in start_urls:
    # Scrape every review for this URL, then persist them.
    items = scrape(url)
    if not items:
        print('No reviews')
        continue
    # One CSV per property, named after the URL slug.
    filename = url.split('Reviews-')[1][:-5]
    print('filename:', filename)
    write_in_csv(items, filename + '.csv', headers, mode='w')
I realized the problem lies in the source code.
hotel_name = soup.find('h1', id='HEADING').text
found no target id in the source website. I substituted it with:
hotel_name = soup.find('h1', class_='heading').text
I hope it can help others!

Scrape page using Python requests

I have some problems with web scraping, here is my code:
from bs4 import BeautifulSoup
import requests
import re
import csv
import argparse
def save_csv_file(filename, array):
    """Write the scraped rows to *filename* behind a fixed header row.

    Binary mode is correct here: this script targets Python 2 (see the
    print statements further down).
    """
    header = ["item_name", "item_price", "item_category"]
    with open(filename, 'wb') as f:
        csv.writer(f).writerows([header] + list(array))
def process_data(name, price, category):
    """Normalise the three scraped cells into a UTF-8 encoded tuple.

    Each argument is a tag with a .text attribute or a falsy value;
    missing cells become 'NA'. Spaces and line breaks are stripped.
    """
    def clean(tag):
        raw = tag.text if tag else 'NA'
        return raw.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")

    return (clean(name), clean(price), clean(category))
def do_scrap(filename, url, payload, headers):
    """POST the search form to Kelkoo and save every result row to *filename*.

    Python 2 code (print statements). *payload* and *headers* mimic the
    site's own search form submission.
    """
    # Request the URL with parameters and headers
    r = requests.post(url, payload, headers = headers, allow_redirects = True)
    if(r.status_code == 200):
        # Save response content in html variable
        html = r.content
        # Parsed html variable into HTML file with bs4
        parsed_html = BeautifulSoup(html, "html.parser")
        # Print document title
        print parsed_html.head.find('title').text
        # Find all of the HTML elements which are describing hotels
        tables = parsed_html.find_all("a", {"class" : "result-link"})
        # Print the numbers of the hotels
        print "Found %s records." % len(tables)
        # Empty helpers
        items = []
        count = 0
        # Looping the HTML elements and print properties for each hotel
        for table in tables:
            name = table.find("h3", {"class" : "result-title"})
            price = table.find("p", {"class" : "price text-truncate"})
            category = table.find("p", {"class" : "merchant-name text-truncate"})
            items.append(process_data(name, price, category))
            count += 1
        if count > 0:
            # Save array with data to csv file
            save_csv_file(filename = filename, array = items)
            # Print end of job info
            print "\n%s records downloaded and saved to %s." % (count, filename)
    else:
        print "Code error: %s" % r.status_code
if __name__ == '__main__':
    # Command line: -p/--product is required, -c/--category optional.
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--product", required=True, help="Product name")
    parser.add_argument("-c", "--category", default="", help="Product category")
    args = vars(parser.parse_args())
    product = args['product']
    category = args['category']
    # Mimic the site's own search form submission.
    payload = {
        'siteSearchQuery': product,
        'from': 'colibri'
    }
    headers = {
        'Host':'www.kelkoo.co.uk',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate, sdch',
        'Accept-Language':'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
    }
    url = "http://www.kelkoo.co.uk/ctl/do/search"
    filename = "%s_co_uk_kelkoo_data.csv" % product
    do_scrap(
        filename=filename,
        url=url,
        payload=payload,
        headers=headers)
After this request I am getting different result than I put this:
www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri
into my web browser, what is causing this problem? Is there is something related to page redirection or something?
I can see multiple things that will cause you to get different results:
You initiate a POST not a GET. Lookup params for requests.get.
They use javascript to modify the page.

Categories