Problem/Error with scraping in a pandas data frame with beautifulsoup - python

I'm working with this CSV (https://www.kaggle.com/jtrofe/beer-recipes) and I want to scrape every URL in the data frame, but I keep running into an error and can't scrape all of the URLs. If I try with a single URL it works fine, but with the function there is a problem... Can someone help me?
This is my code:
import requests
from bs4 import BeautifulSoup
from time import sleep
# Browser-like User-Agent header sent with the request.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
base = 'https://www.brewersfriend.com'
# One absolute URL per entry of the data frame's URL column.
links = [f'{base}{r}' for r in df['URL']]
# Retry loop: repeat the request until it succeeds.
# NOTE(review): the whole `links` list is passed to requests.get() here,
# not a single URL.
while True:
try:
r = requests.get(links, headers=headers, stream=False, timeout=8).text
break
except:
# NOTE(review): `r` is assigned the response *text* above (or is still
# unassigned if the request raised), so `r.status_code` and
# `r.raise_for_status()` do not refer to a response object here.
if r.status_code == 404:
print("Client error")
r.raise_for_status()
sleep(1)
soup = BeautifulSoup(r, 'html5lib')
# Look up the <span itemprop="ratingValue"> element; find() gives None when absent.
rating = soup.find('span', {'itemprop': 'ratingValue'})
DEFAULT_VALUE = 'NaN'
if rating is None:
rating = DEFAULT_VALUE
# NOTE(review): when the default was substituted, `rating` is a plain str,
# which has no `.text` attribute.
print(rating.text)
I already know that some pages don't have a rating, so I created DEFAULT_VALUE with 'NaN' (Not a Number), but maybe that is an error too.
Before this code there is the data frame, but I don't put it too.
I hope someone can help me!
Thanks so much

All kinds of messy things here. I won't go over all of it, but one thing I see is that you are trying to print rating.text. If your rating is 'NaN', one error is that you can't do rating.text, because a plain string has no .text attribute.
This is not how I would write this up, but going off your initial coding:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep

# Printed in place of the rating when a page has no ratingValue element.
DEFAULT_VALUE = 'NaN'

df = pd.read_csv('C:/recipeData/recipeData.csv', encoding = 'ISO-8859-1')
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
base = 'https://www.brewersfriend.com'
links = [f'{base}{r}' for r in df['URL']]

# Fetch each recipe page in turn and print "<link>: <rating>".
for link in links:
    try:
        response = requests.get(link, headers=headers, stream=False, timeout=8)
    except requests.RequestException as exc:
        # Network-level failure (timeout, DNS, connection reset...): report
        # it and move on instead of silently swallowing every exception.
        print('%s: request failed (%s)' % (link, exc))
        continue

    if response.status_code == 404:
        print("Client error")
        continue

    soup = BeautifulSoup(response.text, 'html5lib')
    # find() returns None when the page has no rating, so read `.text`
    # only after checking for that.
    rating_tag = soup.find('span', {'itemprop': 'ratingValue'})
    rating = DEFAULT_VALUE if rating_tag is None else rating_tag.text
    print('%s: %s' %(link,rating))

Here is a way to do entire process
import requests, re
import pandas as pd
from bs4 import BeautifulSoup as bs
# Patterns that extract the data-viewer token and the row count from the
# Kaggle dataset page source.
p = re.compile(r'dataviewToken":"(.*?)"')
p1 = re.compile(r'"rowCount":(\d+)')
results = []

with requests.Session() as s:
    # Step 1: load the dataset page and read the token / row count out of it.
    r = s.get('https://www.kaggle.com/jtrofe/beer-recipes')
    token = p.findall(r.text)[0]
    rows = int(p1.findall(r.text)[0])

    # Step 2: request every row of recipeData.csv through the data-view service.
    data = {"jwe":{"encryptedToken": token},"source":{"type":3,"dataset":{"url":"jtrofe/beer-recipes","tableType":1,"csv":{"fileName":"recipeData.csv","delimiter":",","headerRows":1}}},"select":["BeerID","Name","URL","Style","StyleID","Size(L)","OG","FG","ABV","IBU","Color","BoilSize","BoilTime","BoilGravity","Efficiency","MashThickness","SugarScale","BrewMethod","PitchRate","PrimaryTemp"],"skip":0,"take": rows}
    base = 'https://www.brewersfriend.com'
    r = s.post('https://www.kaggleusercontent.com/services/datasets/kaggle.dataview.v1.DataViewer/GetDataView', json = data).json()
    # Each row's 'text' list is indexed here as [1] -> name, [2] -> relative URL.
    names, links = zip(*[(row['text'][1], base + row['text'][2]) for row in r['dataView']['rows']])

    # Step 3: visit every recipe page and collect its rating.
    # Iterating names and links together replaces the manual index counter.
    for name, link in zip(names, links):
        r = s.get(link, headers = {'User-Agent' : 'Mozilla/5.0'})
        rating = 'N/A'
        if r.status_code != 403:
            soup = bs(r.content, 'lxml')
            rating_tag = soup.select_one('[itemprop=ratingValue]')
            if rating_tag is not None:
                rating = rating_tag.text
        results.append([name, rating])

df = pd.DataFrame(results, columns = ['Name', 'Rating'])
print(df.head())
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )

Related

Trouble using pandas read_html() : ValueError

from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
# First page of the daily-price listing; used to discover the last page number.
url = "https://finance.naver.com/item/sise_day.nhn?code=068270&page=1"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
res = requests.get(url, verify=True, headers=headers)
# NOTE(review): `doc` is not used below; the HTML parsed is `res.text`
# from the requests call above.
with urlopen(url) as doc:
html = BeautifulSoup(res.text, 'lxml')
# The href of the link inside <td class="pgRR"> is split on '=' and its last
# piece is taken as the last page number.
pgrr = html.find('td', class_='pgRR')
s = str(pgrr.a['href']).split('=')
last_page = s[-1]
# NOTE(review): `pd` is not imported anywhere in this snippet.
df = pd.DataFrame()
sise_url = 'http://finance.naver.com/item/sise_day.nhn?code=068270'
for page in range(1, int(last_page)+1):
page_url = '{}&page={}'.format(sise_url, page)
# read_html() is given the URL directly, so this fetch does not use the
# `headers` dict defined above. Note `header` is passed as the string '0'.
df = df.append(pd.read_html(page_url, encoding='euc-kr', header='0')[0])
df = df.dropna() # Remove rows with missing values.
print(df)
I'm getting a ValueError while crawling the daily stock data on Naver Finance.
I have no trouble getting the URL, but when I use read_html() I get a ValueError (table not found) from the line df = df.append(pd.read_html(page_url, encoding='euc-kr', header='0')[0]). Please give some advice.
I don't read Korean... however pd.read_html() was getting an error page. Resolved this by requests.get() with headers. Then pass res.text to read_html()
from io import BytesIO
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://finance.naver.com/item/sise_day.nhn?code=068270&page=1"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}

# Fetch the first page with browser-like headers to find the last page number.
# (The extra urlopen() call was dropped: its result was never read.)
res = requests.get(url, verify=True, headers=headers)
html = BeautifulSoup(res.text, 'lxml')
pgrr = html.find('td', class_='pgRR')
last_page = int(str(pgrr.a['href']).split('=')[-1])

sise_url = 'http://finance.naver.com/item/sise_day.nhn?code=068270'
frames = []
for page in range(1, last_page + 1):
    page_url = '{}&page={}'.format(sise_url, page)
    # Download with the same headers, then hand the body to read_html();
    # letting read_html() fetch the URL itself is what returned the error page.
    res = requests.get(page_url, verify=True, headers=headers)
    # Pass the raw bytes as a file-like object so `encoding` is what decodes
    # them; passing a literal HTML string is deprecated in recent pandas.
    frames.append(pd.read_html(BytesIO(res.content), encoding='euc-kr')[0])

# DataFrame.append() was removed in pandas 2.0; concatenate once at the end.
df = pd.concat(frames) if frames else pd.DataFrame()

Limited number of scraped data?

I am scraping a website and everything seems work fine from today's news until news published in 2015/2016. After these years, I am not able to scrape news.
Could you please tell me if anything has changed?
I should get 672 pages getting titles and snippets from this page:
https://catania.liveuniversity.it/attualita/
but I have got approx. 158.
The code that I am using is:
import bs4, requests
import pandas as pd
import re
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
page_num=1
website="https://catania.liveuniversity.it/attualita/"
# Keep requesting pages for as long as the current page contains at least one
# <a class="page-numbers"> element.
while True:
r = requests.get(website, headers=headers)
soup = bs4.BeautifulSoup(r.text, 'html')
# NOTE(review): `title` and `date` are reassigned on every pass, so only the
# values from the most recent page are kept.
title=soup.find_all('h2')
date=soup.find_all('span', attrs={'class':'updated'})
if soup.find_all('a', attrs={'class':'page-numbers'}):
website = f"https://catania.liveuniversity.it/attualita/page/{page_num}"
page_num +=1
print(page_num)
else:
break
# NOTE(review): `dates` and `titles` are not defined in this snippet; the
# loop above assigns `date` and `title`.
df = pd.DataFrame(list(zip(dates, titles)),
columns =['Date', 'Titles'])
I think there has been some changes in tags (for example in next page button, or just in the date/title tag).
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
def main(req, num):
    """Fetch listing page *num* and return its rows.

    Returns a list of ``(date, title, content)`` tuples, one per
    ``div.col-lg-8.col-md-8.col-sm-8`` block on the page. If any expected
    element is missing (AttributeError), the page URL is printed and
    ``False`` is returned instead.
    """
    page_url = "https://catania.liveuniversity.it/attualita/page/{}/".format(num)
    r = req.get(page_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    rows = []
    try:
        for block in soup.select("div.col-lg-8.col-md-8.col-sm-8"):
            date = block.select_one("span.updated").text
            title = block.findAll("a")[1].text
            content = block.select_one("div.entry-content").get_text(strip=True)
            rows.append((date, title, content))
    except AttributeError:
        print(r.url)
        return False
    return rows
# Fan the 672 listing pages out over a thread pool that shares one HTTP
# session, then gather the rows in submission order.
with ThreadPoolExecutor(max_workers=30) as executor, requests.Session() as req:
    fs = [executor.submit(main, req, num) for num in range(1, 673)]
    allin = []
    for future in fs:
        page_rows = future.result()
        # main() returns False for a page it could not parse; skip those.
        if page_rows:
            allin.extend(page_rows)
    df = pd.DataFrame.from_records(
        allin, columns=["Date", "Title", "Content"])
    print(df)
    df.to_csv("result.csv", index=False)

Exception has occurred: TypeError in Python

I am very new to coding so I am sorry this is a dumb question. I keep getting an error every time I try to run this code for a Python scraper. Any help would be great.
Exception has occurred: TypeError
'module' object is not callable
File "C:\Users\quawee\OneDrive\seaporn.org-scraper\seaporn.org-scraper.py", line 33, in <module>
articles = requests(x)
from this code....
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
# Collects one {'link': ...} dict per article across all pages.
articlelist = []
# Download listing page `x` and return its <article class="post-summary"> tags.
def request(x):
url = f'https://www.seaporn.org/category/hevc/page/{x}/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, features='lxml')
return soup.find_all('article', class_ = 'post-summary')
# Append each article's link href to `articlelist`.
def parse(articles):
for item in articles:
link = item.find({'a': 'entry-link'})
article = {
'link': link['href']
}
articlelist.append(article)
# Write the collected links to articlelist.xlsx.
def output():
df = pd.DataFrame(articlelist)
df.to_excel('articlelist.xlsx', index=False)
print('Saved to xlsx.')
x = 5000
# Walk pages upward from `x` until a page yields no articles.
while True:
print(f'Page {x}')
# NOTE(review): this calls `requests` (the imported module), not the
# `request` function defined above -- the line named in the traceback.
articles = requests(x)
x = x + 1
time.sleep(3)
if len(articles) != 0:
parse(articles)
else:
break
print('Completed, total articles is', len(articlelist))
output()
The name of your defined function is request(x). You are calling requests(x) inside the while loop.
This should work, I just corrected the spelling:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
# Collects one {'link': ...} dict per article across all pages.
articlelist = []


def request(x):
    """Download listing page *x* and return its ``article.post-summary`` tags."""
    page_url = f'https://www.seaporn.org/category/hevc/page/{x}/'
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
    response = requests.get(page_url, headers=request_headers)
    return BeautifulSoup(response.content, features='lxml').find_all(
        'article', class_ = 'post-summary')
def parse(articles):
    """Append a ``{'link': href}`` record to ``articlelist`` for each article.

    Articles in which no link (or no ``href`` on it) is found are skipped
    rather than raising, so one malformed entry does not abort the crawl.
    """
    for item in articles:
        # NOTE(review): the dict is passed as the tag filter; presumably the
        # intent is "<a> with class entry-link" -- confirm against the markup.
        link = item.find({'a': 'entry-link'})
        if link is None or not link.get('href'):
            continue
        articlelist.append({'link': link['href']})
def output():
    """Write every collected article record to ``articlelist.xlsx``."""
    pd.DataFrame(articlelist).to_excel('articlelist.xlsx', index=False)
    print('Saved to xlsx.')
# Walk the listing pages upward from 5000, pausing between requests, and stop
# at the first page that yields no articles.
x = 5000
while True:
    print(f'Page {x}')
    articles = request(x)
    x += 1
    time.sleep(3)
    if len(articles) == 0:
        break
    parse(articles)
print('Completed, total articles is', len(articlelist))
output()

Multiple Page BeautifulSoup Script only Pulling first value

New to screen scraping here and this is my first time posting on stackoverflow. Aplogies in advance for any formatting errors in this post. Attempting to extract data from multiple pages with URL:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
For instance, page 1 is:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-1
Page 2:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-2
and so on...
My script is running without errors. However, my Pandas exported csv only contains 1 row with the first extracted value. At the time of this posting, the first value is:
14.01 Acres   Vestaburg, Montcalm County, MI$275,000
My intent is to create a spreadsheet with hundreds of rows that pull the property description from the URLs.
Here is my code:
import requests
from requests import get
from bs4 import BeautifulSoup
headers = ({'User-Agent':
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
}
)
n_pages = 0
desc = []
# Request result pages 1..899, stopping at the first page that has no
# <div class="propName"> elements.
for page in range(1,900):
n_pages += 1
sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
r=get(sapo_url, headers=headers)
page_html = BeautifulSoup(r.text, 'html.parser')
house_containers = page_html.find_all('div', class_="propName")
if house_containers != []:
for container in house_containers:
# NOTE(review): this rebinds `desc` to a single string on every pass, so
# the list created above is replaced rather than extended.
desc = container.getText(strip=True)
else:
break
# NOTE(review): `desc` is a string at this point, so len(desc) is its length
# in characters.
print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))
import pandas as pd
# Wrapping `desc` in a list yields a data frame with a single row.
df = pd.DataFrame({'description': [desc]})
df.to_csv('test4.csv', encoding = 'utf-8')
I suspect the problem is with the line reading desc = container.getText(strip=True) and have tried changing the line but keep getting errors when running.
Any help is appreciated.
I believe the mistake is in the line:
desc = container.getText(strip=True)
Every time it loops, the value in desc is replaced, not added on. To add items into the list, do:
desc.append(container.getText(strip=True))
Also, since it is already a list, you can remove the brackets from the DataFrame creation like so:
df = pd.DataFrame({'description': desc})
The cause is that nothing is accumulated inside the loop, so only the last value is saved. For testing purposes, the code below only goes through the first two pages (range(1,3)), so please adjust the range to suit your needs.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
}

n_pages = 0
desc = []  # one description string per property, across all pages

# Only the first two pages are requested here for testing; widen the range
# to scrape more.
for page in range(1,3):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if not house_containers:
        # A page without any property block ends the crawl.
        break
    # Accumulate instead of overwriting, so every property is kept.
    desc.extend(container.getText(strip=True) for container in house_containers)

# Build the frame once from the collected list rather than concatenating a
# one-row frame per property.
all_data = pd.DataFrame({'description': desc})
all_data.to_csv('test4.csv', encoding = 'utf-8')
# `desc` is a list here, so len(desc) is the number of properties (it used to
# be the character count of the last description).
print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))

Why does my web scraper only work half the time?

My goal is to get the product name and price of all Amazon pages detected in any website that I feed to my program.
My input is a text file containing five websites. In each of these websites, a total of five to fifteen amazon links are to be found.
My code is this:
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json
from urllib2 import Request, urlopen, HTTPError, URLError
def isdead(url):
    """Probe *url* after a 10-second pause.

    Returns the HTTP status code on an HTTPError (e.g. 404 for a broken
    link), the failure reason on a URLError, and False when the URL opens.
    """
    agent = 'Mozilla/20.0.1 (compatible; MSIE 5.5; Windows NT)'
    probe = Request(url, headers = { 'User-Agent':agent })
    sleep(10)
    try:
        urlopen(probe)
    except HTTPError as e:
        return e.code #404 if link is broken
    except URLError as e:
        return e.reason
    return False
def check(url):
    """Return "unavailable" or "available" for the product page at *url*.

    The text inside ``<div id="availability">`` is collected, and the page is
    reported as unavailable when the word "unavailable" occurs
    (case-insensitively) within the first 30 characters of that text.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers = headers)
    doc = html.fromstring(page.content)
    # XPath selects attributes with "@"; "#id" is not valid XPath syntax.
    XPATH_AVAILABILITY = '//div[@id ="availability"]//text()'
    raw_availability = doc.xpath(XPATH_AVAILABILITY)
    availability = ''.join(raw_availability).strip()
    if re.search(r'unavailable', availability[:30], re.IGNORECASE):
        return "unavailable"
    return "available"
# Read the list of source websites, one URL per line, from a user-named file.
file_name = raw_input("Enter file name: ")
filepath = "%s"%(file_name)
with open(filepath) as f:
listoflinks = [line.rstrip('\n') for line in f]
all_links = []
# Collect every distinct Amazon link (amazon.com or amzn.to) found on each site.
for i in listoflinks:
htmls = req.get(i)
doc = SimplifiedDoc(htmls)
amazon_links = doc.getElements('a')
amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
for a in amazon_links:
if a.href not in all_links:
all_links.append(a.href)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# Drop Prime and product-review links.
all_links = [x for x in all_links if "amazon.com/gp/prime" not in x]
all_links = [y for y in all_links if "amazon.com/product-reviews" not in y]
# Visit each product link and print its title and price.
for i in all_links:
print "LINK:"
print i
response = requests.get(i, headers=headers)
soup = BeautifulSoup(response.content, features="lxml")
# NOTE(review): isdead() and check() each issue their own request for the
# same link, in addition to the requests.get() above.
if isdead(i) == 404:
print "DOES NOT EXIST"
print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
pass
else:
# Indexing [0] raises IndexError when the page has no #productTitle
# element -- the line shown in the reported traceback.
title = soup.select("#productTitle")[0].get_text().strip()
if check(i) == "unavailable":
price = "UNAVAILABLE"
else:
# soup.select() returns a list of tags, so `price` is printed as a list.
if (len(soup.select("#priceblock_ourprice")) == 0) and (len(soup.select("#priceblock_saleprice")) == 0):
price = soup.select("#a-offscreen")
elif len(soup.select("#priceblock_ourprice")) == 0:
price = soup.select("#priceblock_saleprice")
else:
price = soup.select("#priceblock_ourprice")
print "TITLE:%s"%(title)
print "PRICE:%s"%(price)
print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
Whenever it works fine, the output looks something like this (please don't judge the PRICE output, I have spent so much time trying to fix that but nothing works because I can't turn it into a string and get_text() doesn't work. This project is just for personal use so it's not that important, but if you have suggestions, I'm very receptive to those.):
LINK:
https://www.amazon.com/dp/B007Y6LLTM/ref=as_li_ss_tl?ie=UTF8&linkCode=ll1&tag=lunagtkf1-20&linkId=ee8c5299508af57c815ea6577ede4244
TITLE:Moen 7594ESRS Arbor Motionsense Two-Sensor Touchless One-Handle Pulldown Kitchen Faucet Featuring Power Clean, Spot Resist Stainless
PRICE:[<span class="a-size-medium a-color-price priceBlockBuyingPriceString" id="priceblock_ourprice">$359.99</span>]
/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/
... and so on.
The error looks like this:
Traceback (most recent call last):
File "name.py", line 75, in <module>
title = soup.select("#productTitle")[0].get_text().strip()
IndexError: list index out of range
It's so weird because there's a text file that's fed so many times and sometimes, all sites are scraped well, but sometimes, the error appears at the 10th Amazon product, sometimes, the error appears at the 1st product...
I'm suspecting it's a bot detection problem, but I have a header. What's the problem?
Your code is too messy. I've organized it for you, please check out if it works.
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
file_name = raw_input("Enter file name: ")
filepath = "%s"%(file_name)
with open(filepath) as f:
listoflinks = [line.rstrip('\n') for line in f]
all_links = []
for i in listoflinks:
htmls = req.get(i)
doc = SimplifiedDoc(htmls)
amazon_links = doc.getElements('a')
amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
amazon_links = amazon_links.notContains(['amazon.com/gp/prime','amazon.com/product-reviews'],attr='href')
for a in amazon_links:
if a.href not in all_links:
all_links.append(a.href)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
print "LINK:"
print i
response = requests.get(i, headers=headers)
if response.status_code == 404:
print "DOES NOT EXIST"
print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
pass
else:
html = response.text
doc = SimplifiedDoc(html)
title = doc.getElementByID("productTitle").text
if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable')>0:
price = "UNAVAILABLE"
else:
if doc.getElementByID("priceblock_ourprice"):
price = doc.getElementByID("priceblock_ourprice").text
elif doc.getElementByID("priceblock_saleprice"):
price = doc.getElementByID("priceblock_saleprice").text
else:
price = doc.getElementByID("a-offscreen").text
print "TITLE:%s"%(title)
print "PRICE:%s"%(price)
print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
You may want to learn more about the framework :) so here is an example of using it.
Here are more examples of simplified_scrapy here
If you need any help, please let me know.
from simplified_scrapy.spider import Spider, SimplifiedDoc
class MySpider(Spider):
name = 'amazon-product'
# allowed_domains = ['example.com']
start_urls = []
refresh_urls = True # For debug. If efresh_urls = True, start_urls will be crawled again.
filepath='' # Your file path
if filepath:
with open(filepath) as f:
start_urls = [line.rstrip('\n') for line in f]
def extract(self, url, html, models, modelNames):
doc = SimplifiedDoc(html)
amazon_links=None
data = None
if url['url'].find('https://www.amazon.com')>=0 or url['url'].find('https://amzn.to')>=0:
title = doc.getElementByID("productTitle").text
if doc.getElementByID('availability') and doc.getElementByID('availability').text.find('unavailable')>0:
price = "UNAVAILABLE"
else:
if doc.getElementByID("priceblock_ourprice"):
price = doc.getElementByID("priceblock_ourprice").text
elif doc.getElementByID("priceblock_saleprice"):
price = doc.getElementByID("priceblock_saleprice").text
else:
price = doc.getElementByID("a-offscreen").text
data = [{"title":title,'price':price}] # Get target data
print "TITLE:%s"%(title)
print "PRICE:%s"%(price)
print "/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/"
else:
amazon_links = doc.getElements('a')
amazon_links = amazon_links.containsOr(['https://www.amazon.com/','https://amzn.to/'],attr='href')
amazon_links = amazon_links.notContains(['amazon.com/gp/prime','amazon.com/product-reviews'],attr='href')
return {"Urls": amazon_links, "Data": data} # Return data to framework
# Hand an instance of the spider to the framework's startThread() entry point.
from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(MySpider()) # Start crawling

Categories