Get data from product page and back Scraper - python

class Crawler():
def __init__(self):
self.pag = 1
i = 0
def get_urls(self,main_url):
self.url = 'https://www.test.ro/search/'+ main_url +'/p1'
self.filename = main_url
r = requests.get(self.url)
soup = BeautifulSoup(r.text, 'html.parser')
number_pages = soup.find(class_= 'row' )
last_page = number_pages.find_all('a')[len(number_pages.find_all('a'))-2].get("data-page")
for i in range(1, int(last_page)+1):
url.append('https://www.test.ro/search/'+ main_url +'/p' + str(i))
def print_urls(self):
for urls in url:
print (urls)
def scrape(self,url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
product_list = soup.find(class_ = 'page-container')
product_list_name = product_list.find_all('h2')
product_list_oldprice = product_list.find_all(class_ = 'product-old-price')
product_list_newprice = product_list.find_all(class_ = 'product-new-price')
for i in range(0, len(product_list_name)):
name = product_list_name[i].get_text().strip()
link = product_list_name[i].find('a').get('href')
#print(name)
#print(len(name))
try:
price = product_list_oldprice[i].contents[0].get_text()
price = price[:-6]
#print(price)
except IndexError:
#print("no old price")
#print(product_list_newprice[i].contents[0])
with open(self.filename+'.csv', 'a', encoding = 'utf-8', newline='') as csv_file:
file_is_empty = os.stat(self.filename+'.csv').st_size == 0
fieldname = ['name','link', 'price_old', 'price_actualy']
writer = csv.DictWriter(csv_file, fieldnames = fieldname)
if file_is_empty:
writer.writeheader()
writer.writerow({'name':name,'link':link, 'price_old':price, 'price_actualy':product_list_newprice[i].contents[0]})
if __name__=='__main__':
print("Search for product: ")
urlsearch = input()
starttime = time.time()
scraper = Crawler()
scraper.get_urls(urlsearch)
scraper.print_urls()
#scraper.scrape(url[0])
pool = multiprocessing.Pool()
pool.map(scraper.scrape,url)
pool.close()
print('That took {} seconds'.format(time.time() - starttime))
So I have this scraper, it works perfectly on any website bag but only on the product page.
I did it for a specific website, but how could I go on each page to take the data from the product and give it back and do it all over again?
Is such a thing possible?
I now take the data from the products page, ie name, link, price.
You have divs there too.
Can I help href?

In this case you need to create a category scraper that safes all product urls first. Scrape all urls and go through all the category's and for example safe them to csv first (the product urls). Then you can take all the product urls from the CSV and loop through all of them.

Related

extract names in custom <h2> but It is extracted many times beautifulsoup

I am trying to extract names in custom <h2>, but the names I want are extracted many times.
how to fix this problem and extract it one time
The page I am pulling data from
here
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
try:
result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
src = result.content
soup = BeautifulSoup(src, "lxml")
page_limit = int("126")
if(page_num > page_limit // 25):
print("page ended, terminate")
break
lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
for i in range(len(lawy_names)) :
lawy_name.append(lawy_names[i].text.strip())
links.append(lawy_names[i].find("a").attrs["href"])
for link in links:
result = requests.get(link)
src = result.content
soup = BeautifulSoup(src, "lxml")
phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
phone.append(phones["href"])
logos = soup.find("div", {"class":"photo-container"})
logo.append(logos.find('img')['src'])
websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
website.append(websites.text.strip())
page_num +=1
print("page switched")
except:
print("error")
break
file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["lawyer name","phone","website","logo"])
wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, as such a dictionary could be used to hold all of your data. Simply skip any entries for which you have already seen the same name. For example:
from bs4 import BeautifulSoup
import requests
import csv
lawyers = {}
page_num = 1
while True:
print(f"Page {page_num}")
req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
soup = BeautifulSoup(req.content, "lxml")
found = False
for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
div_results = soup.find('div', id=id)
if div_results:
for result in div_results.find_all('div', class_='lawyer'):
name = result.h2.get_text(strip=True)
if name not in lawyers:
print(' ', name)
link = result.h2.a['href']
req_details = requests.get(link)
soup_details = BeautifulSoup(req_details.content, "lxml")
a_phone = soup_details.find("a", {"class":"profile-phone-header profile-contact-btn"}, href=True)
if a_phone:
phone = a_phone['href']
else:
phone = None
div_logo = soup_details.find("div", {"class":"photo-container"})
if div_logo.img:
logo = div_logo.img['src']
else:
logo = None
a_website = soup_details.find("a", {"class":"profile-website-header","id":"firm_website"})
if a_website:
website = a_website.get_text(strip=True)
else:
website = None
lawyers[name] = [phone, logo, website]
found = True
# Keep going until no new names found
if found:
page_num += 1
else:
break
with open('Moaaz.csv', 'w', newline='') as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
for name, details in lawyers.items():
csv_output.writerow([name, *details])

Python web scraping empty result

I followed a youtube tutorial on web scraping to scrape this website https://books.toscrape.com/ but i'm getting an empty result
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
all_books = []
url = "http://books.toscrape.com/catalogue/page-1.html"
def get_page(url):
page = requests.get(url)
status = page.status_code
soup = bs(page.text, "lxml")
return [soup, status]
def get_links(soup):
links = []
listings = soup.find_all(class_="product_pod")
def get_links(soup):
links = []
listings = soup.find_all(class_="product_pod")
def extract_info(links):
for listing in listings:
bk_lnk = listing.find("h5").a.get("href")
base_url = "http://books.toscrape.com/catalogue"
cmplt_lnk = base_url + bk_lnk
links.append(cmplt_lnk)
return links
def extract_info(links):
for link in links:
res = requests.get(link).text
book_soup = bs(res, "lxml")
title = book_soup.find(class_ = "col-sm-6 product_main").h1. text.strip()
price = book_soup.find(class_ = "col-sm-6 product_main").p. text.strip()
book = {"title": title, "price": price}
all_books.append(book)
pg = 1
while True:
url = f"http://books.toscrape.com/catalogue/page-{pg}.html"
soup_status = get_page(url)
if soup_status[1] == 200:
print (f"scraping page {pg}")
extract_info(get_links(soup_status[0]))
pg += 1
else:
print("The End")
break
df = pd.DataFrame(all_books)
print (df)
here's the result am getting
Empty DataFrame
Columns: []
Index: []
my colab notebook link
https://colab.research.google.com/drive/1Lyvwt_WLpE9tqy1qheZg80N70CFSsk-E?usp=sharing
def get_links(soup):
links = []
listings = soup.find_all(class_="product_pod")
def extract_links():
for listing in listings:
bk_lnk = listing.find("h3").a.get("href")
base_url = "https://books.toscrape.com/catalogue/"
cmplt_lnk = base_url + bk_lnk
links.append(cmplt_lnk)
return links
return extract_links()
def extract_info(links):
for link in links:
res = requests.get(link).text
book_soup = bs(res, "lxml")
title = book_soup.find(class_ = "col-sm-6 product_main").h1.text.strip()
price = book_soup.find(class_ = "col-sm-6 product_main").p.text.strip()
book = {"title": title, "price": price}
all_books.append(book)
pg = 45
while True:
url = f"https://books.toscrape.com/catalogue/page-{pg}.html"
soup_status = get_page(url)
if soup_status[1] == 200:
print (f"scraping page {pg}")
extract_info(get_links(soup_status[0]))
pg += 1
else:
print("The End")
break
Your list is empty . Need to call your functions .. such as
Get_page(url) which should return a list which you can use soup in your subsequent function ..

Script crawls only the first page instead of multiple pages

I am trying to crawl multiple pages of a website. But the program can only crawl the first page.
import requests
from bs4 import BeautifulSoup
import re
import json
import time
def make_soup(url):
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
script = soup.find("script", text=pattern)
jsonData = pattern.search(script.text).group(1)
pattern_number = re.compile(r'\"[0-9]{9,12}\":(\{\"data\":\{\"cachedFilters\":(.*?)\}\}),\"[0-9]{9,11}\"')
jsonData2 = pattern_number.search(jsonData).group(1)
dictData = json.loads(jsonData2)
return dictData
def get_reviews(dictData):
""" Return a list of five dicts with reviews.
"""
all_dictionaries = []
for data in dictData['data']['locations']:
for reviews in data['reviewListPage']['reviews']:
review_dict = {}
review_dict["reviewid"] = reviews['id']
review_dict["reviewurl"] = reviews['absoluteUrl']
review_dict["reviewlang"] = reviews['language']
review_dict["reviewdate"] = reviews['createdDate']
userProfile = reviews['userProfile']
review_dict["author"] = userProfile['displayName']
all_dictionaries.append(review_dict)
return all_dictionaries
def main():
url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
review_list = get_reviews(dictData) # list with five dicts
#print(review_list)
page_number = 5
while page_number <= 260: # number in the URL
next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
review_list2 = get_reviews(dictData)
print(review_list2)
page_number += 5
time.sleep(0.5)
if __name__ == "__main__":
main()
And I'm not sure if I can crawl multiple pages with this URL. On the website there are 54 pages, but in the URL I always have to add the number 5, like this:
Page 1
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS
Page2
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or5-Coronado_Hotel-Zurich.html#REVIEWS
Page3
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or10-Coronado_Hotel-Zurich.html#REVIEWS
I don't know if this is a good idea.
Do you have any suggestions? Thank you in advance!
You assing new url to next_url but you use url to read page.
next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
You have to rename variable
url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)

How to make request to new url?

I already have this code, helped by a friend before. I already get all the links in the site. I want to get the name, merk, price, picture, description of the product, and the link of the product. The description's product only appear if we click the product.
I'm a beginner in Python.
from bs4 import BeautifulSoup
import urllib.request
count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"
def get_url(url):
req = urllib.request.Request(url)
return urllib.request.urlopen(req)
expected_url = url % count
response = get_url(expected_url)
link = []
name = []
merk = []
price = []
pic = []
description = []
while (response.url == expected_url):
#print("GET {0}".format(expected_url))
soup = BeautifulSoup(response.read(), "html.parser")
products = soup.find("div",{"id":"product-list-grid"})
for i in products:
data = products.findAll("div",{"class":"product-item"})
for j in range(0, len(data)):
link.append(data[j]["data-eec-href"])
count += 1
expected_url = url % count
response = get_url(expected_url)
print(len(link))
"""
import csv
dataset=zip(link, merk, name, pic, price, description)
with open("foundation_sociolla.csv","w", newline='') as csvfile:
writer=csv.writer(csvfile)
header=['link', 'merk', 'name', 'pic', 'price', 'description']
writer.writerow(header)
writer.writerows(dataset)
"""
You need to make a request to the URL. Parse the content of that request and extract the data you want.
from bs4 import BeautifulSoup
import urllib.request
count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"
def get_url(url):
req = urllib.request.Request(url)
return urllib.request.urlopen(req)
expected_url = url % count
response = get_url(expected_url)
link = []
name = []
make = []
price = []
pic = []
description = []
while response.url == expected_url:
soup = BeautifulSoup(response.read(), "html.parser")
for product in soup.select("div.product-item"):
product_url = (product['data-eec-href'])
link.append(product_url)
product_response = get_url(product_url)
product_soup = BeautifulSoup(product_response.read(), "html.parser")
product_pic = product_soup.select('img#bigpic')[0]['src']
pic.append(product_pic)
product_price = product_soup.select('span#our_price_display')[0].text.strip()
price.append(product_price)
product_name = product_soup.select('div.detail-product-logo p')[0].text.strip()
name.append(product_name)
product_make = product_soup.select('div.detail-product-logo h3')[0].text.strip()
make.append(product_make)
product_description = product_soup.select('div#Details article')[0].text.strip()
description.append(product_description)
print(product_url, product_pic, product_price, product_name, product_make, product_description)
count += 1
expected_url = url % count
response = get_url(expected_url)
But if your going to scrape a lot of pages you are much better off using something like Scrapy https://scrapy.org/

BeautifulSoup get links and info inside of them

I would like to scrape a website. Website has 10 preview of complaints in each page. I wrote this script to get links of 10 complaints and some info inside of each link. When I run the script I got this error message "RecursionError: maximum recursion depth exceeded".
Can someone say to me what is the problem. Thank you in advance!!
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
# Create list objects for each information section
C_date = []
C_title = []
C_text = []
U_name = []
U_id = []
C_count = []
R_name = []
R_date = []
R_text = []
# Get 10 links for preview of complaints
def getLinks(url):
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')
c_containers = html_soup.find_all('div', class_='media')
# Store wanted links in a list
allLinks = []
for link in c_containers:
find_tag = link.find('a')
find_links = find_tag.get('href')
full_link = "".join((url, find_links))
allLinks.append(full_link)
# Get total number of links
print(len(allLinks))
return allLinks
def GetData(Each_Link):
each_complaint_page = get(Each_Link)
html_soup = BeautifulSoup(each_complaint_page.text, 'html.parser')
# Get date of complaint
dt = html_soup.main.find('span')
date = dt['title']
C_date.append(date)
# Get Title of complaint
TL = html_soup.main.find('h1', {'class': 'title'})
Title = TL.text
C_title.append(Title)
# Get main text of complaint
Tx = html_soup.main.find('div', {'class': 'description'})
Text = Tx.text
C_text.append(Text)
# Get user name and id
Uname = html_soup.main.find('span', {'class': 'user'})
User_name = Uname.span.text
User_id = Uname.attrs['data-memberid']
U_name.append(User_name)
U_id.append(User_id)
# Get view count of complaint
Vcount = html_soup.main.find('span', {'view-count-detail'})
View_count = Vcount.text
C_count.append(View_count)
# Get reply for complaint
Rpnm = html_soup.main.find('h4', {'name'})
Reply_name = Rpnm.next
R_name.append(Reply_name)
# Get reply date
Rpdt = html_soup.main.find('span', {'date-tips'})
Reply_date = Rpdt.attrs['title']
R_date.append(Reply_date)
# Get reply text
Rptx = html_soup.main.find('p', {'comment-content-msg company-comment-msg'})
Reply_text = Rptx.text
R_text.append(Reply_text)
link_list = getLinks('https://www.sikayetvar.com/arcelik')
for i in link_list:
z = GetData(i)
print(z)
PS: My next step will be to put all information in a data frame
Your GetData() method calls itself, with no base-case: this causes infinite recursion:
def GetData(data):
for i in GetData(data):
You're also calling response = get(i) but then ignoring the result... perhaps you meant to say
def GetData(link):
i = get(link)
...

Categories