I created a market parser for my own purposes, and it works well overall!
Initially I ran into a problem when writing the file: it raised a decode error. After I changed something, the error disappeared, but now it doesn't parse the data into JSON at all; it simply writes 2 characters: {}
Here is main.py:
import json
import requests
from bs4 import BeautifulSoup


def get_first_news():
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36"
    }
    url = "https://funpay.ru/lots/700/"
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    articles_cards = soup.find_all("a", class_="tc-desc-text")

    news_dict = {}
    for article in articles_cards:
        article_title = article.find("div", class_="tc-desc-text").text.strip()
        article_desc = article.find("div", class_="tc-price").text.strip()
        article_url = f'https://funpay.ru/lots/700/{article.get("href")}'
        article_id = article_url.split("=")[-1]
        # print(f"{article_title} | {article_url} | {article_date_timestamp}")
        news_dict[article_id] = {
            "article_title": article_title,
            "article_url": article_url,
            "article_desc": article_desc
        }

    # utf-8 avoids encode errors with non-ASCII titles when ensure_ascii=False
    with open("news_dict.json", "w", encoding="utf-8") as file:
        json.dump(news_dict, file, indent=4, ensure_ascii=False)


def main():
    get_first_news()


if __name__ == '__main__':
    main()
Here is test.py
# url = "https://www.securitylab.ru/news/520908.php"
#
# article_id = url.split("/")[-1]
# article_id = article_id[:-4]
# print(article_id)
import json
with open("news_dict.json") as file:
news_dict = json.load(file)
search_id = "520908123"
if search_id in news_dict:
print("Новость уже есть в словаре, пропускаем итерацию")
else:
print("Свежая новость, добавляем в словарь")
Here is news_dict.json:
{}
In articles_cards = soup.find_all("a", class_="tc-desc-text"),
replace "a" with "div".
Here's what it should look like:
articles_cards = soup.find_all("div", class_="tc-desc-text")
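With div.tc-desc-text as the card, the loop body needs a small adjustment too: find() only searches descendants, so article.find("div", class_="tc-desc-text") returns None once article is that div itself. A minimal sketch of the adjusted loop, assuming each tc-desc-text div sits inside the <a> tag that carries the offer's href (verify against the live markup):

for article in soup.find_all("div", class_="tc-desc-text"):
    article_title = article.text.strip()
    link = article.find_parent("a")  # the surrounding <a href="...">
    article_url = f'https://funpay.ru/lots/700/{link.get("href")}' if link else ""
    price_tag = link.find("div", class_="tc-price") if link else None
    article_desc = price_tag.text.strip() if price_tag else ""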
Related
I am trying to explore web scraping in Python, currently working with Beautiful Soup. I was trying to get the names of the festivals from this site: https://www.skiddle.com/festivals . Everything was going pretty fine, except for one page, this one: https://www.skiddle.com/festivals/front-end-data-test/ . It says 'NoneType' object has no attribute 'find'. Is there any way I can get data from there?
Here is the code
import requests
from bs4 import BeautifulSoup
import lxml
import json

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64"
}

# collect all fests URLs
fests_urls_list = []

# for i in range(0, 120, 24):
for i in range(0, 24, 24):
    url = f"https://www.skiddle.com/festivals/search/?ajaxing=1&sort=0&fest_name=&from_date=15%20Aug%202022&to_date=&maxprice=500&o={i}&bannertitle=August"
    req = requests.get(url=url, headers=headers)
    json_data = json.loads(req.text)
    html_response = json_data["html"]

    with open(f"data/index_{i}.html", "w", encoding="utf-8") as file:
        file.write(html_response)

    with open(f"data/index_{i}.html", "r", encoding="utf-8") as file:
        src = file.read()

    soup = BeautifulSoup(src, "lxml")
    cards = soup.find_all("a", class_="card-details-link")

    for item in cards:
        fest_url = "https://www.skiddle.com" + item.get("href")
        fests_urls_list.append(fest_url)

# collect fest info
for url in fests_urls_list:
    req = requests.get(url=url, headers=headers)
    try:
        soup = BeautifulSoup(req.text, "lxml")
        fest_name = soup.find("div", class_="MuiContainer-root MuiContainer-maxWidthFalse css-1krljt2").find("h1").text.strip()
        fest_data = soup.find("div", class_="MuiGrid-root MuiGrid-item MuiGrid-grid-xs-11 css-twt0ol").text.strip()
        print(fest_data)
    except Exception as ex:
        print(ex)
        print("This was not supposed to happen")
I'm making a Python parser for the site: https://www.kinopoisk.ru/lists/series-top250/
import requests
from bs4 import BeautifulSoup
import csv

CSV = 'genres.csv'
URL = 'https://www.kinopoisk.ru/lists/series-top250/?page=1&tab=all'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0', 'accept': '*/*'}


def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r


def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_='selection-film-item-meta selection-film-item-meta_theme_desktop')
    genres = []
    for item in items:
        additional = item.find_all('span', {'class': 'selection-film-item-meta__meta-additional-item'})
        genres.append(
            {
                'genre': additional[1].get_text(strip=True)
            }
        )
    return genres


def save_genres(items, path):
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(['genre'])
        for item in items:
            writer.writerow([item['genre']])


def parser():
    html = get_html(URL)
    if html.status_code == 200:
        genres = []
        for page in range(1, 6):
            html = get_html(URL, params={'page': page})
            genres.extend(get_content(html.text))
        save_genres(genres, CSV)
    else:
        print('Non_available')


parser()
This section of the site has 5 pages of ratings:
https://www.kinopoisk.ru/lists/series-top250/?page=1&tab=all
...
https://www.kinopoisk.ru/lists/series-top250/?page=5&tab=all
I made a for loop to parse all the pages by changing the page number:
for page in range(1, 6):
    html = get_html(URL, params={'page': page})
    genres.extend(get_content(html.text))
but parsing only happens on page 1. Please tell me, what am I doing wrong?
Also, when I save the result to CSV, each line can contain more than one value (genre designation); I don't know how to make sure there is only one value per line for aggregated analytics.
Thank you!
Remove the parameters from the URL (the part after the ? included): requests appends params to whatever query string is already in the URL, so every request was still being sent with page=1 ahead of your new page value, and the site evidently honors the first one.
import requests
from bs4 import BeautifulSoup
import csv

CSV = "genres.csv"
URL = "https://www.kinopoisk.ru/lists/series-top250/"
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
    "accept": "*/*",
}
PARAMS = {"page": 1, "tab": "all"}


def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r


def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all(
        "div",
        class_="selection-film-item-meta selection-film-item-meta_theme_desktop",
    )
    genres = []
    for item in items:
        additional = item.find_all(
            "span", {"class": "selection-film-item-meta__meta-additional-item"}
        )
        genres.append({"genre": additional[1].get_text(strip=True)})
    return genres


def save_genres(items, path):
    with open(path, "w", newline="") as file:
        writer = csv.writer(file, delimiter=",")
        writer.writerow(["genre"])
        for item in items:
            writer.writerow([item["genre"]])


def parser():
    genres = []
    for page in range(1, 6):
        print("Parsing page {}...".format(page))
        PARAMS["page"] = page
        html = get_html(URL, PARAMS)
        if html.status_code == 200:
            genres.extend(get_content(html.text))
        else:
            print("Non_available")
    save_genres(genres, CSV)


parser()
Creates genres.csv with a genre header and one genre per row.
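On the second question (a single value per line): the code above already writes one genre per row, because only additional[1] is taken for each film. If a genre cell ever holds several comma-separated values, a minimal sketch of splitting them before writing (the comma format is an assumption):

for item in items:
    # hypothetical: item["genre"] may look like "drama, crime"
    for genre in item["genre"].split(","):
        writer.writerow([genre.strip()])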
Below is the URL to scrape:
https://www.agtta.co.in/individuals.php
I need to extract the Name, Mobile number, and Email, and save them into a CSV afterwards.
I am able to scrape the full data with the code below, extracting with a user agent:
from bs4 import BeautifulSoup
import urllib.request

urls = ['https://www.agtta.co.in/individuals.php']
for url in urls:
    req = urllib.request.Request(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    resp = urllib.request.urlopen(req)
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features='html.parser')
    scrape_data = soup.find('section', class_='b-branches')
    to_list = scrape_data.find_all_next(string=True)
I tried with:
for biz in results:
    # print(biz)
    title = biz.findAll('h3', {'class': 'b-branches__title ui-title-inner ui-title-inner_lg'})
    print(title)
I'm getting [<h3 class="b-branches__title ui-title-inner ui-title-inner_lg">SHRI RAMESHBHAI P. SAKARIYA</h3>]
The tag comes along while extracting. How do I remove the tag?
My expected output:
Name, Mobilenumber, Email
A, 333, mm@gmail.com
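findAll returns a list of Tag objects, which is why the markup is printed along with the text. To keep only the text, call .get_text() (or .text) on each element, e.g.:

for tag in title:
    print(tag.get_text(strip=True))  # SHRI RAMESHBHAI P. SAKARIYA, without the <h3> wrapper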
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd

urls = ['https://www.agtta.co.in/individuals.php']
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}

for url in urls:
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features='html.parser')

    result = []
    for individual in soup.findAll("section", {"class": "b-branches"}):
        name = individual.h3.text
        phone_data = individual.find('p')
        phone = phone_data.text.replace("Mobile No", "").strip() if phone_data else ""
        email_data = individual.select('div:contains("Email")')
        email = email_data[0].text.replace("Email", "").strip() if email_data else ""
        result.append({"Name": name, "Phone": phone, "Email": email})

output = pd.DataFrame(result)
output.to_csv("Details.csv", index=False)
Here is the full code to do it:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}

r = requests.get('https://www.agtta.co.in/individuals.php', headers=headers).text
soup = BeautifulSoup(r, 'html5lib')
sections = soup.find_all('section', class_="b-branches")

names = []
phone_numbers = []
emails = []
for section in sections:
    name = section.h3.text
    names.append(name)

    phone_number = section.p.text
    phone_number = phone_number.split('Mobile No ')[1]
    phone_numbers.append(phone_number)

    try:
        email = section.find_all('div')[3].text
        email = email.split('Email ')[1]
        emails.append(email)
    except IndexError:  # some entries have no email div
        emails.append(None)

details_dict = {"Names": names,
                "Phone Numbers": phone_numbers,
                "Emails": emails}

df = pd.DataFrame(details_dict)
df.to_csv("Details.csv", index=False)
Output: a Details.csv file with Names, Phone Numbers, and Emails columns.
Hope that this helps!
I've created a script in Python making use of POST HTTP requests to get the search results from a webpage. To populate the results, it is necessary to click on several fields sequentially; a new page then loads, and that is where the results get populated.
There are ten results on the first page, and the following script can parse them flawlessly.
What I wish to do now is use those results to reach their inner pages in order to parse the Sole Proprietorship Name (English) from there.
The website address is the url in the script below.
I've tried so far with:
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"

payload = {
    'QueryString': '0',
    'SourceAppCode': 'cambodia-br-soleproprietorships',
    'OriginalVersionIdentifier': '',
    '_CBASYNCUPDATE_': 'true',
    '_CBHTMLFRAG_': 'true',
    '_CBNAME_': 'buttonPush'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    res = s.get(url)
    target_url = res.url.split("&")[0].replace("view.", "update.")
    node = re.findall(r"nodeW\d.+?-Advanced", res.text)[0].strip()
    payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
    payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
    payload[node] = 'N'
    payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[2]
    payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'", res.text)[0].strip()

    res = s.post(target_url, data=payload)
    soup = BeautifulSoup(res.content, 'html.parser')
    for item in soup.find_all("span", class_="appReceiveFocus")[3:]:
        print(item.text)
How can I parse the Name (English) from each of the results inner page using requests?
This is one of the ways you can parse the name from the site's inner pages and then the email address from the Addresses tab. I added the .get_email() function only to show you how to parse content from different tabs.
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
result_url = "https://www.businessregistration.moc.gov.kh/cambodia-master/viewInstance/update.html?id={}"
base_url = "https://www.businessregistration.moc.gov.kh/cambodia-br-soleproprietorships/viewInstance/update.html?id={}"


def get_names(s):
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    res = s.get(url)
    target_url = result_url.format(res.url.split("id=")[1])
    soup = BeautifulSoup(res.text, "lxml")
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['QueryString'] = 'a'
    payload['SourceAppCode'] = 'cambodia-br-soleproprietorships'
    payload['_CBNAME_'] = 'buttonPush'
    payload['_CBHTMLFRAG_'] = 'true'
    payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
    payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
    payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[-1]
    payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'", res.text)[0].strip()

    res = s.post(target_url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")
    payload.pop('_CBHTMLFRAGNODEID_')
    payload.pop('_CBHTMLFRAG_')
    payload.pop('_CBHTMLFRAGID_')

    for item in soup.select("a[class*='ItemBox-resultLeft-viewMenu']"):
        payload['_CBNAME_'] = 'invokeMenuCb'
        payload['_CBVALUE_'] = ''
        payload['_CBNODE_'] = item['id'].replace('node', '')

        res = s.post(target_url, data=payload)
        soup = BeautifulSoup(res.text, 'lxml')
        address_url = base_url.format(res.url.split("id=")[1])
        node_id = re.findall(r"taba(.*)_", soup.select_one("a[aria-label='Addresses']")['id'])[0]
        payload['_CBNODE_'] = node_id
        payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
        payload['_CBNAME_'] = 'tabSelect'
        payload['_CBVALUE_'] = '1'
        eng_name = soup.select_one(".appCompanyName + .appAttrValue").get_text()
        yield from get_email(s, eng_name, address_url, payload)


def get_email(s, eng_name, url, payload):
    res = s.post(url, data=payload)
    soup = BeautifulSoup(res.text, 'lxml')
    email = soup.select_one(".EntityEmailAddresses:contains('Email') .appAttrValue").get_text()
    yield eng_name, email


if __name__ == '__main__':
    with requests.Session() as s:
        for item in get_names(s):
            print(item)
The output looks like:
('AMY GEMS', 'amy.n.company@gmail.com')
('AHARATHAN LIN LIANJIN FOOD FLAVOR', 'skykoko344@gmail.com')
('AMETHYST DIAMOND KTV', 'twobrotherktv@gmail.com')
To get just the Name (English), you can simply replace print(item.text) with print(item.text.split('/')[1].split('(')[0].strip()), which prints AMY GEMS.
My goal is to get the product name and price of all Amazon pages detected on any website that I feed to my program.
My input is a text file containing five websites. On each of these websites, five to fifteen Amazon links are to be found.
My code is this:
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json
from urllib2 import Request, urlopen, HTTPError, URLError


def isdead(url):
    user_agent = 'Mozilla/20.0.1 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = Request(url, headers=headers)
    sleep(10)

    try:
        page_open = urlopen(req)
    except HTTPError, e:
        return e.code  # 404 if link is broken
    except URLError, e:
        return e.reason
    else:
        return False


def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    doc = html.fromstring(page.content)
    XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
    RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
    AVAILABILITY = ''.join(RAw_AVAILABILITY).strip()

    # re.findall(...) returns a list: if empty, available; if not, unavailable.
    # return re.findall(r'Available from', AVAILABILITY[:30], re.IGNORECASE)
    if len(re.findall(r'unavailable', AVAILABILITY[:30], re.IGNORECASE)) == 1:
        return "unavailable"
    else:
        return "available"


file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    htmls = req.get(i)
    doc = SimplifiedDoc(htmls)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

all_links = [x for x in all_links if "amazon.com/gp/prime" not in x]
all_links = [y for y in all_links if "amazon.com/product-reviews" not in y]

for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    soup = BeautifulSoup(response.content, features="lxml")

    if isdead(i) == 404:
        print "DOES NOT EXIST"
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
    else:
        title = soup.select("#productTitle")[0].get_text().strip()
        if check(i) == "unavailable":
            price = "UNAVAILABLE"
        else:
            if (len(soup.select("#priceblock_ourprice")) == 0) and (len(soup.select("#priceblock_saleprice")) == 0):
                price = soup.select("#a-offscreen")
            elif len(soup.select("#priceblock_ourprice")) == 0:
                price = soup.select("#priceblock_saleprice")
            else:
                price = soup.select("#priceblock_ourprice")

        print "TITLE:%s" % (title)
        print "PRICE:%s" % (price)
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"

print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
Whenever it works fine, the output looks something like this (please don't judge the PRICE output; I have spent so much time trying to fix it, but nothing works, because I can't turn it into a string and get_text() doesn't work. This project is just for personal use, so it's not that important, but if you have suggestions, I'm very receptive to those):
LINK:
https://www.amazon.com/dp/B007Y6LLTM/ref=as_li_ss_tl?ie=UTF8&linkCode=ll1&tag=lunagtkf1-20&linkId=ee8c5299508af57c815ea6577ede4244
TITLE:Moen 7594ESRS Arbor Motionsense Two-Sensor Touchless One-Handle Pulldown Kitchen Faucet Featuring Power Clean, Spot Resist Stainless
PRICE:[<span class="a-size-medium a-color-price priceBlockBuyingPriceString" id="priceblock_ourprice">$359.99</span>]
/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/
... and so on.
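An aside on the PRICE output: select() returns a list of tags, which is why get_text() appears not to work; it has to be called on an element of that list, not on the list itself. A minimal sketch using the same selector as above:

tags = soup.select("#priceblock_ourprice")
if tags:
    price = tags[0].get_text().strip()  # e.g. "$359.99" as a plain string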
The error looks like this:
Traceback (most recent call last):
File "name.py", line 75, in <module>
title = soup.select("#productTitle")[0].get_text().strip()
IndexError: list index out of range
It's so weird, because the same text file is fed in every time, and sometimes all sites are scraped well, but sometimes the error appears at the 10th Amazon product, sometimes at the 1st...
I suspect it's a bot-detection problem, but I have a header. What's the problem?
Your code is too messy. I've organized it for you; please check whether it works.
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests

file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    htmls = req.get(i)
    doc = SimplifiedDoc(htmls)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    amazon_links = amazon_links.notContains(['amazon.com/gp/prime', 'amazon.com/product-reviews'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    if response.status_code == 404:
        print "DOES NOT EXIST"
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
    else:
        html = response.text
        doc = SimplifiedDoc(html)
        title = doc.getElementByID("productTitle").text
        if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable') > 0:
            price = "UNAVAILABLE"
        else:
            if doc.getElementByID("priceblock_ourprice"):
                price = doc.getElementByID("priceblock_ourprice").text
            elif doc.getElementByID("priceblock_saleprice"):
                price = doc.getElementByID("priceblock_saleprice").text
            else:
                price = doc.getElementByID("a-offscreen").text

        print "TITLE:%s" % (title)
        print "PRICE:%s" % (price)
        print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"

print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
You should learn more :) so here is an example of using the framework.
There are more examples of simplified_scrapy available here.
If you need any help, please let me know.
from simplified_scrapy.spider import Spider, SimplifiedDoc
from simplified_scrapy.simplified_main import SimplifiedMain


class MySpider(Spider):
    name = 'amazon-product'
    # allowed_domains = ['example.com']
    start_urls = []
    refresh_urls = True  # For debug. If refresh_urls = True, start_urls will be crawled again.

    filepath = ''  # Your file path
    if filepath:
        with open(filepath) as f:
            start_urls = [line.rstrip('\n') for line in f]

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        amazon_links = None
        data = None
        if url['url'].find('https://www.amazon.com') >= 0 or url['url'].find('https://amzn.to') >= 0:
            title = doc.getElementByID("productTitle").text
            if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable') > 0:
                price = "UNAVAILABLE"
            else:
                if doc.getElementByID("priceblock_ourprice"):
                    price = doc.getElementByID("priceblock_ourprice").text
                elif doc.getElementByID("priceblock_saleprice"):
                    price = doc.getElementByID("priceblock_saleprice").text
                else:
                    price = doc.getElementByID("a-offscreen").text
            data = [{"title": title, 'price': price}]  # Get target data
            print "TITLE:%s" % (title)
            print "PRICE:%s" % (price)
            print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
        else:
            amazon_links = doc.getElements('a')
            amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
            amazon_links = amazon_links.notContains(['amazon.com/gp/prime', 'amazon.com/product-reviews'], attr='href')
        return {"Urls": amazon_links, "Data": data}  # Return data to framework


SimplifiedMain.startThread(MySpider())  # Start crawling