I have developed a script to scrape the URL, title, and other information from Amazon Best Seller categories. The script below works fine, but it's very slow: Amazon has many levels of sub-categories, so traversing all of them takes a long time.
Is there anything I can do to make it run faster? I'm using Python 2.7, 64-bit.
Thanks
import requests
import json
import threading
from bs4 import BeautifulSoup
import re
def GetSoupResponseFromURL(url):
    response = requests.get(url, timeout=180)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup
def GetSubCategories(categoryURL):
    subCategory = []
    soup = GetSoupResponseFromURL(categoryURL)
    try:
        ul = soup.find('span', {'class':'zg_selected'}).parent.parent.find('ul')
        if ul is not None:
            subCategories = ul.find_all('a')
            for category in subCategories:
                catTitle = category.text
                url = category.get('href')

                lists = soup.find('ul', {'id':'zg_browseRoot'}).find_all('ul')
                del lists[-1]

                global titleList
                titleList = []
                for ulist in lists:
                    text = re.sub(r'[^\x00-\x7F]+','', ulist.find('li').text)
                    titleList.append(text.strip(' \t\n\r'))
                fullTitle = (' > '.join(map(str, titleList)) + ' > ' + catTitle)

                soup = GetSoupResponseFromURL(url)
                title = soup.find('span', {'class':'category'})
                if title is not None:
                    title = title.text
                else:
                    title = soup.find('div', {'id':'zg_rssLinks'}).find_all('a')[-1].text
                    title = title[title.index('>') + 2:]

                print('Complete Title: ' + fullTitle)
                print('Title: ' + title)
                print('URL: ' + url)
                print('-----------------------------------')

                data = {}
                data['completeTitle'] = fullTitle
                data['title'] = title
                data['url'] = url
                data['subCategory'] = GetSubCategories(url)
                subCategory.append(data)
    except Exception, e:
        pass
    return subCategory
class myThread(threading.Thread):
    def __init__(self, threadID, url):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.url = url

    def run(self):
        print "Starting Thread " + str(self.threadID)
        array = []
        array = GetSubCategories(self.url)
        with open('Category ' + str(self.threadID) + '.json', 'w') as outfile:
            json.dump(array, outfile)
        print "Exiting Thread " + str(self.threadID)
mainURL = 'https://www.amazon.fr/gp/bestsellers/ref=zg_bs_unv_petsupplies_0_2036875031_3'
soup = GetSoupResponseFromURL(mainURL)
mainCategories = soup.find('ul', {'id':'zg_browseRoot'}).find_all('a')
print mainCategories
counter = 1
for category in mainCategories[1:2]:
    thread = myThread(counter, category.get('href'))
    thread.start()
    counter += 1
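Two changes usually help the most with a crawl like this, sketched below only as a rough outline: reuse a single requests.Session so connections are pooled instead of re-opened for every request, and fetch the sibling categories of each level concurrently with a thread pool rather than recursing one request at a time. The fetch_category parsing stub and the crawl_level name are placeholders, not part of the original script.

import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool  # thread-based pool; works on 2.7 and 3.x

session = requests.Session()  # one pooled session instead of a fresh connection per request

def get_soup(url):
    response = session.get(url, timeout=30)
    return BeautifulSoup(response.content, 'html.parser')

def fetch_category(url):
    # placeholder for the per-category parsing done in GetSubCategories()
    soup = get_soup(url)
    return url, [a.get('href') for a in soup.find_all('a')]

def crawl_level(urls, workers=10):
    # fetch all sub-categories of one level concurrently
    pool = ThreadPool(workers)
    try:
        return pool.map(fetch_category, urls)
    finally:
        pool.close()
        pool.join()

Level by level, you collect the sub-category URLs from one batch of results and feed them back into crawl_level, which keeps the recursive traversal but parallelises every level; lowering the 180-second timeout also stops a single dead page from stalling a whole branch.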
Related
I followed a YouTube tutorial on web scraping to scrape this website, https://books.toscrape.com/, but I'm getting an empty result.
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
all_books = []
url = "http://books.toscrape.com/catalogue/page-1.html"
def get_page(url):
    page = requests.get(url)
    status = page.status_code
    soup = bs(page.text, "lxml")
    return [soup, status]
def get_links(soup):
    links = []
    listings = soup.find_all(class_="product_pod")

    def extract_info(links):
        for listing in listings:
            bk_lnk = listing.find("h5").a.get("href")
            base_url = "http://books.toscrape.com/catalogue"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links

def extract_info(links):
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        title = book_soup.find(class_="col-sm-6 product_main").h1.text.strip()
        price = book_soup.find(class_="col-sm-6 product_main").p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)
pg = 1
while True:
    url = f"http://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        print("The End")
        break

df = pd.DataFrame(all_books)
print(df)
Here's the result I'm getting:
Empty DataFrame
Columns: []
Index: []
My Colab notebook link:
https://colab.research.google.com/drive/1Lyvwt_WLpE9tqy1qheZg80N70CFSsk-E?usp=sharing
def get_links(soup):
    links = []
    listings = soup.find_all(class_="product_pod")

    def extract_links():
        for listing in listings:
            bk_lnk = listing.find("h3").a.get("href")
            base_url = "https://books.toscrape.com/catalogue/"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links

    return extract_links()

def extract_info(links):
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        title = book_soup.find(class_="col-sm-6 product_main").h1.text.strip()
        price = book_soup.find(class_="col-sm-6 product_main").p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)

pg = 45
while True:
    url = f"https://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        print("The End")
        break
Your list is empty because you never call your functions. For example, get_page(url) should return a list whose soup you can then use in your subsequent functions.
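A minimal sketch of that call chain, assuming the functions defined in the question above are in scope (get_page returns [soup, status], and get_links must return the list it builds so that extract_info has something to loop over):

soup, status = get_page("http://books.toscrape.com/catalogue/page-1.html")
if status == 200:
    links = get_links(soup)   # get_links has to return the list it builds
    extract_info(links)       # fills all_books with {"title": ..., "price": ...} dicts
print(pd.DataFrame(all_books))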
import csv
import os
import time
import multiprocessing

import requests
from bs4 import BeautifulSoup

url = []  # listing-page URLs collected by get_urls() and consumed by scrape()

class Crawler():
    def __init__(self):
        self.pag = 1
        i = 0

    def get_urls(self, main_url):
        self.url = 'https://www.test.ro/search/' + main_url + '/p1'
        self.filename = main_url
        r = requests.get(self.url)
        soup = BeautifulSoup(r.text, 'html.parser')
        number_pages = soup.find(class_='row')
        last_page = number_pages.find_all('a')[len(number_pages.find_all('a')) - 2].get("data-page")
        for i in range(1, int(last_page) + 1):
            url.append('https://www.test.ro/search/' + main_url + '/p' + str(i))

    def print_urls(self):
        for urls in url:
            print(urls)

    def scrape(self, url):
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        product_list = soup.find(class_='page-container')
        product_list_name = product_list.find_all('h2')
        product_list_oldprice = product_list.find_all(class_='product-old-price')
        product_list_newprice = product_list.find_all(class_='product-new-price')
        for i in range(0, len(product_list_name)):
            name = product_list_name[i].get_text().strip()
            link = product_list_name[i].find('a').get('href')
            #print(name)
            #print(len(name))
            try:
                price = product_list_oldprice[i].contents[0].get_text()
                price = price[:-6]
                #print(price)
            except IndexError:
                #print("no old price")
                #print(product_list_newprice[i].contents[0])
                pass
            with open(self.filename + '.csv', 'a', encoding='utf-8', newline='') as csv_file:
                file_is_empty = os.stat(self.filename + '.csv').st_size == 0
                fieldname = ['name', 'link', 'price_old', 'price_actualy']
                writer = csv.DictWriter(csv_file, fieldnames=fieldname)
                if file_is_empty:
                    writer.writeheader()
                writer.writerow({'name': name, 'link': link, 'price_old': price, 'price_actualy': product_list_newprice[i].contents[0]})

if __name__ == '__main__':
    print("Search for product: ")
    urlsearch = input()
    starttime = time.time()
    scraper = Crawler()
    scraper.get_urls(urlsearch)
    scraper.print_urls()
    #scraper.scrape(url[0])
    pool = multiprocessing.Pool()
    pool.map(scraper.scrape, url)
    pool.close()
    print('That took {} seconds'.format(time.time() - starttime))
So I have this scraper; it works perfectly, but only on the product listing page.
I made it for a specific website, but how could I go to each product's own page, take the data from the product, return it, and do it all over again for the next one?
Is such a thing possible?
Right now I take the data from the products page, i.e. name, link, price.
There are divs there too.
Could the href links help me here?
In this case you need to create a category scraper that saves all product URLs first. Scrape all the URLs, go through all the categories, and, for example, save the product URLs to a CSV first. Then you can take all the product URLs from the CSV and loop through them.
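A rough sketch of that two-stage flow; the h2/a selector mirrors the question's code, but the helper names and CSV layout are made up for illustration, not taken from the site above:

import csv
import requests
from bs4 import BeautifulSoup

def collect_product_urls(listing_url):
    # Stage 1: pull every product link from one listing page.
    soup = BeautifulSoup(requests.get(listing_url).text, 'html.parser')
    return [a.get('href') for a in soup.select('h2 a') if a.get('href')]

def save_urls(urls, path='product_urls.csv'):
    with open(path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows([[u] for u in urls])

def scrape_products(path='product_urls.csv'):
    # Stage 2: loop over the saved URLs and scrape each product page.
    with open(path, newline='', encoding='utf-8') as f:
        for (product_url,) in csv.reader(f):
            product_soup = BeautifulSoup(requests.get(product_url).text, 'html.parser')
            # ... extract name, price, etc. from product_soup here

Splitting the crawl this way also means you can re-run the product stage alone if it fails halfway, without re-scraping every category page.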
I'm scraping this website, but I have a problem: after I run the code it only prints the first event, and the for loop is not iterating.
To append the events to a list:
class Events(object):
    def __init__(self, title=None, place=None, date_posted=None, description=None, linkhref=None):
        self.title = title
        self.place = place
        self.date_posted = date_posted
        self.description = description
        self.linkhref = linkhref
I want to save it to a CSV:
csv_file = open('scrape2.csv', 'w', encoding="utf-8")
csv_writer = csv.writer(csv_file)

# Columns
csv_writer.writerow(['title', 'link', 'place', 'date_posted', 'description'])
In this part I want to scrape 3 pages of the website and find details such as the date, event name, venue, and description of each event.
def scrape():
    for page in range(0, 2):
        page = page + 1
        base_url = 'https://allevents.in/malacca/all?ref=cityhome-popular' + str(page)
        source = requests.get(base_url)
        soup = BeautifulSoup(source.text, "html.parser")

        all_event = soup.find_all('div', class_="event-list listview")

        events = []
        for item in all_event:
            title = item.find("h3").text.strip()
            link = item.find("h3")
            linkhref = link.find("a").get('href')
            place = item.find("p", {"class": "location"}).text.strip()
            date_posted = item.find("div", {"class": "right"}).text.strip()
            description = item.find("p", {"class": "short-desc"}).text.strip()
            csv_writer.writerow([title, link, place, date_posted, description])
            events.append(Events(title, link, place, date_posted, description))
This is the for loop to iterate over the list:
    for event in events:
        print("Title: " + event.title)
        print("Link: " + event.linkhref)
        print("Place: " + str(event.place))
        print("Date: " + event.date_posted)
        print("Description: " + event.description)
        print("-----------------------------------")

    csv_file.close()
    return 1

if __name__ == "__main__":
    print(scrape())
You can fetch and write the results in different ways. Moreover, you can make use of namedtuple to shake off verbosity. Here is how I would do it.
import csv
import requests
from bs4 import BeautifulSoup
from collections import namedtuple

class Events(object):

    def __init__(self):
        self.base_url = 'https://allevents.in/malacca/all?ref=cityhome-popular'
        self.items = namedtuple('itemDocument', ['title', 'linkhref', 'place', 'date_posted', 'description'])

    def scrape(self):
        source = requests.get(self.base_url)
        soup = BeautifulSoup(source.text, "lxml")
        for item in soup.find_all('div', class_="event-item"):
            title = item.find("h3").text.strip()
            linkhref = item.find("h3").find("a").get('href')
            place = ' '.join(item.find("p", {"class": "location"}).text.split())
            date_posted = item.find("div", {"class": "right"}).text.strip()
            description = item.find("p", {"class": "short-desc"}).text.strip()
            yield self.items(title, linkhref, place, date_posted, description)

if __name__ == "__main__":
    scraper = Events()
    with open("outputFile.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'linkhref', 'place', 'date_posted', 'description'])
        for item in scraper.scrape():
            writer.writerow([item.title, item.linkhref, item.place, item.date_posted, item.description])
Now, you can plug the logic for traversing the different pages into the above script; I've left that part out for brevity.
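For example, a hedged sketch of one way to bolt that traversal on, assuming the listing accepts a page query parameter (an assumption about the site, not something confirmed above):

import requests
from bs4 import BeautifulSoup

def scrape_pages(base_url, max_pages=3):
    # walk a handful of listing pages and yield every event container found
    for page in range(1, max_pages + 1):
        source = requests.get(base_url, params={'page': page})
        soup = BeautifulSoup(source.text, "lxml")
        items = soup.find_all('div', class_="event-item")
        if not items:
            break  # no events on this page; stop early
        for item in items:
            yield item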
That happens because find only returns the first matching element it finds. To return all the elements with a specified tag such as "h3", use find_all instead; do check that first, and mark this answer as useful if it solves your issue.
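A quick illustration of the difference on a throwaway snippet:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<h3>a</h3><h3>b</h3>", "html.parser")
print(soup.find("h3").text)                    # 'a'  -- only the first match
print([h.text for h in soup.find_all("h3")])   # ['a', 'b'] -- every match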
You are almost there; just change the middle section of your code, beginning with the assignment of all_event:
all_event = soup.find_all('h3')

events = []
for item in all_event:
    title = item.a.text
    linkhref = item.a['href']
    place = item.findNext('span').text.strip()
    date_posted = item.findNext('div', class_="right").text.strip()
    description = item.findNext('p', class_="short-desc").text.strip()
It should work from there, maybe with some modifications.
I am trying to scrape some news. I have a larger list of 3k articles from this site, selected by certain criteria, and (considering I am new to Python) I came up with this script to scrape them:
import pandas as pd
import bs4
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
# get the URL list
list1 = []
a = 'https://www.dnes.bg/sofia/2019/03/13/borisov-se-pohvali-prihodite-ot-gorivata-sa-sys-7-poveche.404467'
b = 'https://www.dnes.bg/obshtestvo/2019/03/13/pazim-ezika-si-pravopis-pod-patronaja-na-radeva.404462'
c = 'https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091'
list1.append(a)
list1.append(b)
list1.append(c)
# define the variables
#url = "https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091"
list2 = list1 #[0:10]
#type(list2)
href = []
title = []
subtitle = []
time = []
article = []
art1 = []
#
#dd = soup.find("div", "art_author").text
#dd
filename = "scraped.csv"
f = open(filename, "w")
#headers = "href;title;subtitle;time;article\n"
headers = "title;subtitle;time;article\n"
f.write(headers)
for url in list2:
    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml').decode('windows-1251')
    href = url
    title = soup.find("h1", "title").string
    #title = soup.find("h1", "title").string
    #title.extend(soup.find("h1", "title").string) # the title string
    subtitle = soup.find("div", "descr").string
    #subtitle.extend(soup.find("div", "descr").string) # the subtitle string
    time = soup.find("div", "art_author").text
    #time.extend(soup.find("div", "art_author").text)
    #par = soup.find("div", id="art_start").find_all("p")
    art1.extend(soup.find("div", id="art_start").find_all("p"))
    for a in art1:
        #article.extend(art1.find_all("p"))
        article = ([a.text.strip()])
        break
    #href = "".join(href)
    title = "".join(title)
    subtitle = "".join(subtitle)
    time = "".join(time)
    article = "".join(article)
    #f.write(href + ";" + title + ";" + subtitle + ";" + time + ";" + article + "\n")
    f.write(title + ";" + subtitle + ";" + time + ";" + article + "\n")
f.close()
The main problem for now is that I get an error:
File "<ipython-input-12-9a796b182a82>", line 24, in <module>
title = soup.find("h1", "title").string
TypeError: slice indices must be integers or None or have an __index__ method
I can't really find a solution to this.
The second problem is that whenever I do succeed in scraping a site, some empty cells occur, which means I have to find a way to deal with the Ajax-loaded content.
I use Anaconda version 2018.12.
Something I stumbled upon (here: https://www.youtube.com/watch?v=FSH77vnOGqU):
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()

def main():
    page = Page('https://pythonprogramming.net/parsememcparseface/')
    soup = bs.BeautifulSoup(page.html, 'html.parser')
    js_test = soup.find('p', class_='jstest')
    print(js_test.text)

if __name__ == '__main__':
    main()
OK. I fixed the issue of your soup object being stored as a string: calling .decode('windows-1251') on the BeautifulSoup object turns it into a plain str, so soup.find becomes str.find, which expects integer start/end arguments and raises the "slice indices must be integers" TypeError. Keeping soup as a BeautifulSoup object lets you use bs4 to parse the HTML. I also opted to use pandas' .to_csv(), as I'm just more familiar with it, but it gets you the desired output:
import pandas as pd
from bs4 import BeautifulSoup
import requests
# get the URL list
list1 = []
a = 'https://www.dnes.bg/sofia/2019/03/13/borisov-se-pohvali-prihodite-ot-gorivata-sa-sys-7-poveche.404467'
b = 'https://www.dnes.bg/obshtestvo/2019/03/13/pazim-ezika-si-pravopis-pod-patronaja-na-radeva.404462'
c = 'https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091'
list1.append(a)
list1.append(b)
list1.append(c)
# define the variables
#url = "https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091"
list2 = list1 #[0:10]
#type(list2)
results = pd.DataFrame()
for url in list2:
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    href = url
    title = soup.find("h1", "title").text
    #title = soup.find("h1", "title").string
    #title.extend(soup.find("h1", "title").string) # the title string
    subtitle = soup.find("div", "descr").text
    #subtitle.extend(soup.find("div", "descr").string) # the subtitle string
    time = soup.find("div", "art_author").text
    #time.extend(soup.find("div", "art_author").text)
    #par = soup.find("div", id="art_start").find_all("p")
    art1 = soup.find("div", id="art_start").find_all("p")

    article = []
    for a in art1:
        if 'googletag.cmd.push' not in a.text:
            article.append(a.text.strip())
    article = ' '.join(article)

    temp_df = pd.DataFrame([[title, subtitle, time, article]], columns=['title', 'subtitle', 'time', 'article'])
    results = results.append(temp_df).reset_index(drop=True)

results.to_csv("scraped.csv", index=False, encoding='utf-8-sig')
Output:
print (results.to_string())
title subtitle time article
0 Борисов се похвали: Приходите от горивата са с... Мерките за изсветляване на сектора действат, к... Обновена: 13 мар 2019 13:24 | 13 мар 2019 11:3... Приходите от горивата са със 7% повече. Това с...
1 "Пазим езика си": Правопис под патронажа на Ра... Грамотността зависи не само от училището, смят... Обновена: 13 мар 2019 11:34 | 13 мар 2019 11:2... За втора поредна година Сдружение "Живата вода...
2 Политиката – "неканен гост" на празничната нов... Основателни ли бяха критиките на президента Ру... 3 яну 2019 10:45, Цветелин Димитров Оказа ли се политиката "неканен гост" на празн...
I have a script that loops through multiple pages of a website and I want to skip over or add a blank space for the item that might not be on certain pages. For example, there are some pages that do not contain a description about the book. When I run into one of those pages I get an attribute error. My script below loops through the first two pages with no problem, but when it hits the third page it stops.
Here is the traceback
    item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/beautifulsoup4-4.6.0-py3.6.egg/bs4/element.py", line 737, in __getattr__
AttributeError: 'NavigableString' object has no attribute 'text'
How can I fix this? Here is my script:
from bs4 import BeautifulSoup as soup
import requests
import json

base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30

for i in range(4, n+1):
    response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))

    #html parsing
    page_soup = soup(response.content, "html5lib")

    #grabs info for each textbook
    containers = page_soup.findAll("div", {"class": "LongDescription"})
    author = page_soup.select("p")
    about = page_soup.find("div", {"id": "AboutBook"})

    for container in containers:
        item = {}
        item['type'] = "Textbook"
        item['title'] = container.find("div", {"class": "twothird"}).h1.text
        item['author'] = author[3].get_text(separator=', ')
        if item['author'] == " ":
            item['author'] = "University of Minnesota Libraries Publishing"
        item['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
        if not container.find(string="Publisher: "):
            item['publisher_url'] = item['publisher'] = ""
        else:
            item['publisher'] = container.find(text="Publisher: ").nextSibling.text
            item['publisher_url'] = container.find(text="Publisher: ").nextSibling['href']
        item['source'] = "Open Textbook Library"
        if not about.h2.nextSibling.nextSibling.nextSibling:
            item['description'] = ""
        else:
            item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
        item['base_url'] = "https://open.umn.edu/opentextbooks/"
        if container.find("p", {"class": "Badge-Condition"}).a:
            item['license'] = container.find("p", {"class": "Badge-Condition"}).a.text
        if container.find("img", {"class": "ctl00_maincontent_imgLicence"}):
            item['license'] = ''
        if container.find("p", {"class": "Badge-Condition"}).a:
            item['license_url'] = container.find("p", {"class": "Badge-Condition"}).a["href"]
        if container.find("img", {"class": "ctl00_maincontent_imgLicence"}):
            item['license_url'] = ''
        if container.find("div", {"class": "twothird"}).p:
            item['review'] = container.find("div", {"class": "twothird"}).p.text
        else:
            item['review'] = ''
        if item['review'].startswith('('):
            item['review'] = item['review'].replace('(', '')
        if item['review'].endswith(' reviews)'):
            item['review'] = item['review'].replace(' reviews)', '')
        if item['review'] > str(0):
            item['review'] = "Reviewed Resource"
        else:
            item['review'] = ''
        item['image_url'] = "https://open.umn.edu/opentextbooks/" + container.img["src"]
        data.append(item)  # add the item to the list

with open("./json/otl-1.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)
I wouldn't recommend parsing the description with item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text; that's too specific. I came up with this code:
from bs4 import BeautifulSoup as soup
import requests
import json
from pprint import pprint

base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30

for i in range(4, n+1):
    response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
    page_soup = soup(response.content, "lxml")
    data = {}

    title, author, description = page_soup.select('h1')[0].text, \
                                 page_soup.select('h1 ~ p')[3].get_text(', '), \
                                 '\n'.join(p.text.strip() for p in page_soup.select('div#AboutBook > p') if p.text.strip())

    data['type'] = "Textbook"
    data['title'] = title
    data['author'] = author if author.strip() else "University of Minnesota Libraries Publishing"
    data['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
    data['source'] = "Open Textbook Library"
    data['description'] = description

    pprint(data)

    # with open("./json/otl-1.json", "w") as writeJSON:
    #     json.dump(data, writeJSON, ensure_ascii=False)
Prints:
{'author': 'University of Minnesota Libraries Publishing',
'description': 'This book is intended for an undergraduate or MBA level '
'Financial Accounting course. It covers the standard topics in '
'a standard sequence, utilizing the Socratic method of asking '
'and answering questions.',
'link': 'https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=4',
'source': 'Open Textbook Library',
'title': 'Financial Accounting',
'type': 'Textbook'}
...and so on (for each book)
Wherever you are getting the AttributeError, you can wrap the offending code like this:

try:
    # your code here
except AttributeError:
    pass  # or whatever other handling you prefer
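Applied to the description lookup from the question's loop (using the question's about and item names), that might look like the following; the empty-string fallback is just one reasonable choice:

try:
    item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
except AttributeError:
    item['description'] = ""  # this page has no description block, so leave it blank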