Skip Over Item if element doesn't exist on page - python

I have a script that loops through multiple pages of a website and I want to skip over or add a blank space for the item that might not be on certain pages. For example, there are some pages that do not contain a description about the book. When I run into one of those pages I get an attribute error. My script below loops through the first two pages with no problem, but when it hits the third page it stops.
Here is the traceback
item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/beautifulsoup4-4.6.0-py3.6.egg/bs4/element.py", line 737, in __getattr__ AttributeError: 'NavigableString' object has no attribute 'text'
How can I fix this? Here is my script:
from bs4 import BeautifulSoup as soup
import requests
import json
base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30
for i in range(4, n+1):
response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
#html parsing
page_soup = soup(response.content, "html5lib")
#grabs info for each textbook
containers = page_soup.findAll("div",{"class":"LongDescription"})
author = page_soup.select("p")
about = page_soup.find("div",{"id":"AboutBook"})
for container in containers:
item = {}
item['type'] = "Textbook"
item['title'] = container.find("div",{"class":"twothird"}).h1.text
item['author'] = author[3].get_text(separator=', ')
if item['author'] == " ":
item['author'] = "University of Minnesota Libraries Publishing"
item['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
if not container.find(string="Publisher: "):
item['publisher_url'] = item['publisher'] = ""
else:
item['publisher'] = container.find(text="Publisher: ").nextSibling.text
item['publisher_url'] = container.find(text="Publisher: ").nextSibling['href']
item['source'] = "Open Textbook Library"
if not about.h2.nextSibling.nextSibling.nextSibling:
item['description'] = ""
else:
item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
item['base_url'] = "https://open.umn.edu/opentextbooks/"
if container.find("p",{"class":"Badge-Condition"}).a:
item['license'] = container.find("p",{"class":"Badge-Condition"}).a.text
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license'] = ''
if container.find("p",{"class":"Badge-Condition"}).a:
item['license_url'] = container.find("p",{"class":"Badge-Condition"}).a["href"]
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license_url'] = ''
if container.find("div",{"class":"twothird"}).p:
item['review'] = container.find("div",{"class":"twothird"}).p.text
else:
item['review'] = ''
if item['review'].startswith('('):
item['review'] = item['review'].replace('(', '')
if item['review'].endswith(' reviews)'):
item['review'] = item['review'].replace(' reviews)', '')
if item['review'] > str(0):
item['review'] = "Reviewed Resource"
else:
item['review'] = ''
item['image_url'] = "https://open.umn.edu/opentextbooks/" + container.img["src"]
data.append(item) # add the item to the list
with open("./json/otl-1.json", "w") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)

I wouldn't recommend parsing the description with item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text, that's too much specific. I came up with this code:
from bs4 import BeautifulSoup as soup
import requests
import json
from pprint import pprint
base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30
for i in range(4, n+1):
response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
page_soup = soup(response.content, "lxml")
data = {}
title, author, description = page_soup.select('h1')[0].text, \
page_soup.select('h1 ~ p')[3].get_text(', '), \
'\n'.join(p.text.strip() for p in page_soup.select('div#AboutBook > p') if p.text.strip())
data['type'] = "Textbook"
data['title'] = title
data['author'] = author if author.strip() else "University of Minnesota Libraries Publishing"
data['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
data['source'] = "Open Textbook Library"
data['description'] = description
pprint(data)
# with open("./json/otl-1.json", "w") as writeJSON:
# json.dump(data, writeJSON, ensure_ascii=False)
Prints:
{'author': 'University of Minnesota Libraries Publishing',
'description': 'This book is intended for an undergraduate or MBA level '
'Financial Accounting course. It covers the standard topics in '
'a standard sequence, utilizing the Socratic method of asking '
'and answering questions.',
'link': 'https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=4',
'source': 'Open Textbook Library',
'title': 'Financial Accounting',
'type': 'Textbook'}
...and so on (for each book)

Wherever you are getting the AttributeError you can use the following code:
Try:
your code here
except AttributeError:
pass or other codes

Related

How to combine lists in csv output

I am new to BS4 and python.
For a project i am trying to get some real estate data.
i made my code so that is get two lists.
my challege is to combine te data in the output.
can any one help me please?
ty
ps: any tips on more efficiƫnt code are welkom.
from selenium import webdriver
from bs4 import BeautifulSoup
#open('output.csv', 'w').close()
import re
import time
import requests
from itertools import chain
from pandas import DataFrame
import csv
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
time.sleep(5)
#input('Press Enter after bypassing Captcha')
#print(url)
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
#print(inside)
for huis in info:
#locatie = huis.find('div')
#locatie = ' '.join(locatie.get_text(separator='\r\n', strip=True).split()[:-1])
#locatie = huis.find('h2')
#locatie = ' '.join(locatie.get_text(separator='\r\n', strip=True).split())
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
#sep by newline, strip whitespace, then split to get the last 3 elements to cut out, then rejoin
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
l1 = ('{},{},{},{}'.format(street, address, price, pricetag))
#print('{},{},{},{}'.format(street, address, price, pricetag))
out = open('output.csv', 'w')
saveFile = open('output.csv', 'a')
saveFile.write(street + "," + address + "," + price + "," + pricetag + '\n')
#print (list1)
for items in inside:
href = items.get('href')
#print (href)
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
size_space = 'Unknown'
#print (tr)
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
#print('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
saveFile = open('output.csv', 'a')
saveFile.write(soort+ "," + bouwjaar+ "," + woonoppervlakte + "," + inhoud + "," + perceel + '\n')
saveFile.close()
#output = list(chain(list1,list2))
#print (output)
page += 1
#output = list(chain(list1,list2))
#print (output)
#kenmerken = inside.find_all ('a', {'class': 'href'})
#print (href)
#print (details)
#print('{},{},{},{}'.format(street, address, price, pricetag))
#saveFile = open('jaap.csv', 'a')
#saveFile.write(street + "," + address + "," + price + "," + pricetag + '\n')
#saveFile.close()
jaap_spider(1)
Right now your code doesn't actually seem to make two lists. But asuming that you would make a list of lists for l1 out of for huis in info: and a list of lists l2 from for items in inside:, what you could do to combine two lists of lists is: outputlist = [a + b for a, b in zip(l1, l2)].
I incorporated that, plus a conversion to a Pandas DataFrame and an export to csv in the adapted code below:
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
#browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser = webdriver.Chrome(r'C:\Users\NLNIEH\.spyder-py3\chromedriver.exe')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
time.sleep(5)
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
# Make empty lists with header lines
outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]
for huis in info:
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
outputlist_l1.append([street, address, price, pricetag])
for items in inside:
href = items.get('href')
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
size_space = 'Unknown'
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])
page += 1
# Merge outputlist_l1 with outputlist_l2
outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]
# transform to Pandas dataframe and export as csv
df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
df.to_csv('output.csv', index=False)
jaap_spider(1)
You can use csv for writing list in csv file.
import csv
def write_list_in_file(filepath, output):
with open(filepath, 'a') as outtsv:
tuple_writer = csv.writer(outtsv, delimiter=',')
tuple_writer.writerow(output)

Newbie: Python "AttributeError: 'NoneType' object has no attribute 'text' " when scraping Tripadvisor Reviews

I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
It worked (though, removing the attribute "language") for one link but it doesn't work for any more link (for example.)
I'm receiving the error:
> Traceback (most recent call last):
> File "<pyshell#27>", line 4, in <module>
> items = scrape(url)
> File "<pyshell#12>", line 11, in scrape
> items = parse(session, url + '?filterLang=' + lang)
> File "<pyshell#15>", line 12, in parse
> num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
> AttributeError: 'NoneType' object has no attribute 'text'
I'm attaching the code here with the changes I made in case someone can help me.
Thank you so much!
Silvia
--
I substituted the original:
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
with
num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
With the original code I get the error
ValueError: invalid literal for int() with base 10: '5.695'
(where 5.695 is the number of reviews in the page)
--
Hereby the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io
def display(content, filename='output.html'):
with open(filename, 'wb') as f:
f.write(content)
webbrowser.open(filename)
def get_soup(session, url, show=False):
r = session.get(url)
if show:
display(r.content, 'temp.html')
if r.status_code != 200: # not OK
print('[get_soup] status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def post_soup(session, url, params, show=False):
'''Read HTML from server and convert to Soup'''
r = session.post(url, data=params)
if show:
display(r.content, 'temp.html')
if r.status_code != 200: # not OK
print('[post_soup] status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def scrape(url, lang='ALL'):
# create session to keep all cookies (etc.) between requests
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
})
items = parse(session, url + '?filterLang=' + lang)
return items
def parse(session, url):
'''Get number of reviews and start getting subpages with reviews'''
print('[parse] url:', url)
soup = get_soup(session, url)
if not soup:
print('[parse] no soup:', url)
return
num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
num_reviews = num_reviews[1:-1]
num_reviews = num_reviews.replace(',', '')
num_reviews = int(num_reviews) # convert text into integer
print('[parse] num_reviews ALL:', num_reviews)
url_template = url.replace('.html', '-or{}.html')
print('[parse] url_template:', url_template)
items = []
offset = 0
while(True):
subpage_url = url_template.format(offset)
subpage_items = parse_reviews(session, subpage_url)
if not subpage_items:
break
items += subpage_items
if len(subpage_items) < 5:
break
offset += 5
return items
def get_reviews_ids(soup):
items = soup.find_all('div', attrs={'data-reviewid': True})
if items:
reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
print('[get_reviews_ids] data-reviewid:', reviews_ids)
return reviews_ids
def get_more(session, reviews_ids):
url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
payload = {
'reviews': ','.join(reviews_ids), # ie. "577882734,577547902,577300887",
#'contextChoice': 'DETAIL_HR', # ???
'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX', # ???
'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
'haveCsses': 'apg-Hotel_Review-in',
'Action': 'install',
}
soup = post_soup(session, url, payload)
return soup
def parse_reviews(session, url):
'''Get all reviews from one page'''
print('[parse_reviews] url:', url)
soup = get_soup(session, url)
if not soup:
print('[parse_reviews] no soup:', url)
return
hotel_name = soup.find('h1', id='HEADING').text
reviews_ids = get_reviews_ids(soup)
if not reviews_ids:
return
soup = get_more(session, reviews_ids)
if not soup:
print('[parse_reviews] no soup:', url)
return
items = []
for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
badgets = review.find_all('span', class_='badgetext')
if len(badgets) > 0:
contributions = badgets[0].text
else:
contributions = '0'
if len(badgets) > 1:
helpful_vote = badgets[1].text
else:
helpful_vote = '0'
user_loc = review.select_one('div.userLoc strong')
if user_loc:
user_loc = user_loc.text
else:
user_loc = ''
bubble_rating = review.select_one('span.ui_bubble_rating')['class']
bubble_rating = bubble_rating[1].split('_')[-1]
item = {
'review_body': review.find('p', class_='partial_entry').text,
'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
}
items.append(item)
print('\n--- review ---\n')
for key,val in item.items():
print(' ', key, ':', val)
print()
return items
def write_in_csv(items, filename='results.csv',
headers=['hotel name', 'review title', 'review body',
'review date', 'contributions', 'helpful vote',
'user name' , 'user location', 'rating'],
mode='w'):
print('--- CSV ---')
with io.open(filename, mode, encoding="utf-8") as csvfile:
csv_file = csv.DictWriter(csvfile, headers)
if mode == 'w':
csv_file.writeheader()
csv_file.writerows(items)
DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'
start_urls = [
'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]
headers = [
DB_COLUMN,
DB_COLUMN1,
]
lang = 'it'
for url in start_urls:
# get all reviews for 'url' and 'lang'
items = scrape(url)
if not items:
print('No reviews')
else:
# write in CSV
filename = url.split('Reviews-')[1][:-5]
print('filename:', filename)
write_in_csv(items, filename + '.csv', headers, mode='w')
I realized the problem lies in the source code.
hotel_name = soup.find('h1', id='HEADING').text
found no target id in the source website. I substituted it with:
hotel_name = soup.find('h1', class_='heading').text
I hope it can help others!

Skip Over Item if element doesn't exist on page when looping through multiple pages - BeautifulSoup and Python

I have a script that loops through multiple pages of a website and I want to skip over or add a blank space for the item that might not be on certain pages. For example, there are some pages that do not contain a license. When I run into one of those pages I get an attribute error. My script below loops through the first two pages with no problem, but when it hits the third page it stops. How can I fix this? Here is my script:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import json
base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 50
for i in range(4, n+1):
response = urlopen(base_url + "BookDetail.aspx?bookId=" + str(i))
page_html = response.read()
response.close()
#html parsing
page_soup = soup(page_html, "html.parser")
#grabs info for each textbook
containers = page_soup.findAll("div",{"class":"LongDescription"})
author = page_soup.select("p")
for container in containers:
item = {}
item['type'] = "Textbook"
item['title'] = container.find("div",{"class":"twothird"}).h1.text
item['author'] = author[3].get_text(separator=', ')
if item['author'] == " ":
item['author'] = "University of Minnesota Libraries Publishing"
item['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
item['source'] = "Open Textbook Library"
item['base_url'] = "https://open.umn.edu/opentextbooks/"
item['license'] = container.find("p",{"class":"Badge-Condition"}).a.text
if item['license'] != container.find("p",{"class":"Badge-Condition"}).a.text:
item['license'] = ""
item['license_url'] = container.find("p",{"class":"Badge-Condition"}).a["href"]
data.append(item) # add the item to the list
with open("./json/noSubject/otl-loop.json", "w") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)
I figured it out. My main issue was with item['license'] Here is my fix:
if container.find("p",{"class":"Badge-Condition"}).a:
item['license'] = container.find("p",{"class":"Badge-Condition"}).a.text
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license'] = ''
if container.find("p",{"class":"Badge-Condition"}).a:
item['license_url'] = container.find("p",{"class":"Badge-Condition"}).a["href"]
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license_url'] = ''

Python: save same-title files in the folder

Code:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import json
from os import listdir
res = requests.get('http://www.abcde.com/frontend/SearchParts')
soup = BeautifulSoup(res.text,"lxml")
href = [ a["href"] for a in soup.findAll("a", {"id" : re.compile("parts_img.*")})]
b1 =[]
for url in href:
b1.append("http://www.abcde.com"+url)
#print (b1)
b=[]
for i in range(len(b1)):
res2 = requests.get(b1[i]).text
soup2 = BeautifulSoup(res2,"lxml")
url_n=soup2.find('',rel = 'next')['href']
url_n=("http://www.abcde.com"+url_n)
#print(url_n)
b.append(b1[i])
b.append(url_n)
while True:
res3=requests.get(url_n).text
soup3 = BeautifulSoup(res3,"lxml")
try:
url_n=soup3.find('',rel = 'next')['href']
except TypeError:
break
if url_n:
url_n=("http://www.abcde.com"+url_n)
#print(url_n)
b.append(url_n)
all=[]
for url in b:
res = requests.get(url)
soup = BeautifulSoup(res.text,"lxml")
for item in soup.select(".article-title"):
all.append(urljoin('http://www.abcde.com',item['href']))
for urls in all:
re=requests.get(urls)
soup=BeautifulSoup(re.text.encode('utf-8'), "html.parser")
title_tag = soup.select_one('.page_article_title')
list=[]
for tag in soup.select('.page_article_content'):
list.append(tag.text)
list=([c.replace('\n', '') for c in list])
list=([c.replace('\r', '') for c in list])
list=([c.replace('\t', '') for c in list])
list=([c.replace(u'\xa0', u' ') for c in list])
list= (', '.join(list))
fruit_tag = soup.select_one('.authorlink')
fruit_final=None
if fruit_tag:
fruit_final= fruit_tag.text
else:
fruit_final= fruit_tag
keys=soup.findAll('div', style="font-size:1.2em;")
keys_final=None
list2=[]
if keys:
for key in keys:
list2.append(key.text)
list2=([c.replace('\n', '') for c in list2])
list2=([c.replace(' ', '') for c in list2])
list2= (', '.join(list2))
key_final=list2
else:
key_final=keys
if key_final==[]:
key_final=None
##################edit part####################################
data={
"Title" : title_tag.text,
"Registration": fruit_final,
"Keywords": key_final,
"Article": list
}
save_path= "C:/json/"
files=listdir(save_path)
file_name = save_path+'%s.json' % title_tag.text
with open(file_name, 'w',encoding='UTF-8') as f:
if file_name not in files:
file_d = json.dumps(data,ensure_ascii=False)
f.write(file_d)
else:
file_name = save_path +'%s_1.json' % title_tag.text
file_d = json.dumps(data,ensure_ascii=False)
f.write(file_d)
I scraped a web page and extract every article's title as title_tag.text. I found that some articles have same titles but different urls/contents, so I still need to save them in my directory. Now I know how to check it if two titles are the same, I can just name one as original and another with original_1. But what if I need to save 4 files which have same titles? How to do it in this case? Thanks in advance!

Amazon best seller scraping

I have developed a script to scrape URL, title and other information
from Amazon best seller categories. The script below is working fine, but it's very slow as Amazon has multiple sub to sub categories so in order to traverse all of the sub categories, it takes so much time,
Is there any thing I can do to make it work fast? I'm using Python 2.7 64 bit
Thanks
import requests
import json
import threading
from bs4 import BeautifulSoup
import re
def GetSoupResponseFromURL(url):
response = requests.get(url, timeout=180)
soup = BeautifulSoup(response.content, 'html.parser')
return soup;
def GetSubCategories(categoryURL):
subCategory = []
soup = GetSoupResponseFromURL(categoryURL)
try:
ul = soup.find('span', {'class':'zg_selected'}).parent.parent.find('ul')
if ul is not None:
subCategories = ul.find_all('a')
for category in subCategories:
catTitle = category.text
url = category.get('href')
lists = soup.find('ul', {'id':'zg_browseRoot'}).find_all('ul')
del lists[-1]
global titleList
titleList = []
for ulist in lists:
text = re.sub(r'[^\x00-\x7F]+','', ulist.find('li').text)
titleList.append(text.strip(' \t\n\r'))
fullTitle = (' > '.join(map(str, titleList)) + ' > ' + catTitle)
soup = GetSoupResponseFromURL(url)
title = soup.find('span', {'class':'category'})
if title is not None:
title = title.text
else:
title = soup.find('div', {'id':'zg_rssLinks'}).find_all('a')[-1].text
title = title[title.index('>') + 2:]
print('Complete Title: ' + fullTitle)
print('Title: ' + title)
print('URL: ' + url)
print('-----------------------------------')
data = {}
data['completeTitle'] = fullTitle
data['title'] = title
data['url'] = url
data['subCategory'] = GetSubCategories(url)
subCategory.append(data)
except Exception, e:
pass
return subCategory
class myThread (threading.Thread):
def __init__(self, threadID, url):
threading.Thread.__init__(self)
self.threadID = threadID
self.url = url
def run(self):
print "Starting Thread " + str(self.threadID)
array = []
array = GetSubCategories(self.url)
with open('Category ' + str(self.threadID) + '.json', 'w') as outfile:
json.dump(array, outfile)
print "Exiting Thread " + str(self.threadID)
mainURL = 'https://www.amazon.fr/gp/bestsellers/ref=zg_bs_unv_petsupplies_0_2036875031_3'
soup = GetSoupResponseFromURL(mainURL)
mainCategories = soup.find('ul', {'id':'zg_browseRoot'}).find_all('a')
print mainCategories
counter = 1
for category in mainCategories[1:2]:
thread = myThread(counter, category.get('href'))
thread.start()
counter+=1

Categories