I am scraping a website and everything seems to work fine from today's news back to the news published in 2015/2016. Beyond those years, I am not able to scrape any news.
Could you please tell me if anything has changed?
I should get 672 pages of titles and snippets from this page:
https://catania.liveuniversity.it/attualita/
but I only get approximately 158.
The code that I am using is:
import bs4, requests
import pandas as pd
import re

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

page_num = 1
website = "https://catania.liveuniversity.it/attualita/"

while True:
    r = requests.get(website, headers=headers)
    soup = bs4.BeautifulSoup(r.text, 'html')

    title = soup.find_all('h2')
    date = soup.find_all('span', attrs={'class': 'updated'})

    if soup.find_all('a', attrs={'class': 'page-numbers'}):
        website = f"https://catania.liveuniversity.it/attualita/page/{page_num}"
        page_num += 1
        print(page_num)
    else:
        break

df = pd.DataFrame(list(zip(dates, titles)),
                  columns=['Date', 'Titles'])
I think there have been some changes in the tags (for example in the next-page button, or in the date/title tags).
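Since the archive pages follow a predictable /attualita/page/{n}/ pattern, the approach below skips the next-page button entirely: it builds the URL for each of the 672 pages directly, fetches them concurrently with a ThreadPoolExecutor, and collects the date, title and snippet of each article into a DataFrame.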
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd


def main(req, num):
    r = req.get(
        "https://catania.liveuniversity.it/attualita/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    try:
        data = [(x.select_one("span.updated").text,
                 x.findAll("a")[1].text,
                 x.select_one("div.entry-content").get_text(strip=True))
                for x in soup.select("div.col-lg-8.col-md-8.col-sm-8")]
        return data
    except AttributeError:
        print(r.url)
        return False


with ThreadPoolExecutor(max_workers=30) as executor:
    with requests.Session() as req:
        fs = [executor.submit(main, req, num) for num in range(1, 673)]
        allin = []
        for f in fs:
            f = f.result()
            if f:
                allin.extend(f)
        df = pd.DataFrame.from_records(
            allin, columns=["Date", "Title", "Content"])
        print(df)
        df.to_csv("result.csv", index=False)
I am attempting to update my script so that it searches through not only the URL provided but all of the pages in range (1-3) and adds them to the CSV. Can anyone spot why my current code is not working? Pages after the first follow this format: page-2
from bs4 import BeautifulSoup
import requests
from csv import writer
from random import randint
from time import sleep

# example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url = "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}

for page in range(1, 4):
    req = requests.get(url + 'page-' + str(page), headers=headers)
    # print(page)

soup = BeautifulSoup(req.content, 'html.parser')
lists = soup.find_all('li', class_="pp-property-box")

with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Address', 'Price']
    thewriter.writerow(header)

    for list in lists:
        title = list.find('h2').text
        price = list.find('p', class_="pp-property-price").text
        info = [title, price]
        thewriter.writerow(info)

sleep(randint(2, 10))
You are overwriting req multiple times and end up only analyzing the results of the last page requested. Put everything inside your loop.
edit: Also, the upper limit in range() is not included, so you probably want to do for page in range(1, 4): to get the first three pages.
edit: full example:
from bs4 import BeautifulSoup
import requests
from csv import writer

url = "https://www.propertypal.com/property-for-sale/ballymena-area/page-"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}

with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Address', 'Price']
    thewriter.writerow(header)

    for page in range(1, 4):
        req = requests.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')

        for li in soup.find_all('li', class_="pp-property-box"):
            title = li.find('h2').text
            price = li.find('p', class_="pp-property-price").text
            info = [title, price]
            thewriter.writerow(info)
The solution from bitflip is fine; however, a few things I'll point out to help you.
Try to avoid variable names that shadow predefined functions in Python, for example list.
While csv writer is a fine package to use, also consider using pandas. Further down the road you will likely need to do some data manipulation, so you might as well familiarise yourself with the package now. It's a very powerful tool.
Here's how I would have coded it.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
from time import sleep
from os.path import exists

# example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url = "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}

# Check if the csv file already exists
file_exists = exists('ballymena.csv')

for page in range(1, 4):
    rows = []
    req = requests.get(url + 'page-' + str(page), headers=headers)
    # print(page)
    soup = BeautifulSoup(req.content, 'html.parser')
    lists = soup.find_all('li', class_="pp-property-box")

    for li in lists:
        title = li.find('h2').text
        price = li.find('p', class_="pp-property-price").text
        row = {
            'Address': title,
            'Price': price}
        rows.append(row)

    df = pd.DataFrame(rows)

    # If the file doesn't exist yet, write it with a header
    if not file_exists:
        df.to_csv('ballymena.csv', index=False)
        file_exists = True
    # If it already exists, append to the file without a header
    else:
        df.to_csv('ballymena.csv', mode='a', header=False, index=False)

    sleep(randint(2, 10))
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import pandas as pd

url = "https://finance.naver.com/item/sise_day.nhn?code=068270&page=1"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
res = requests.get(url, verify=True, headers=headers)

with urlopen(url) as doc:
    html = BeautifulSoup(res.text, 'lxml')
    pgrr = html.find('td', class_='pgRR')
    s = str(pgrr.a['href']).split('=')
    last_page = s[-1]

df = pd.DataFrame()
sise_url = 'http://finance.naver.com/item/sise_day.nhn?code=068270'

for page in range(1, int(last_page)+1):
    page_url = '{}&page={}'.format(sise_url, page)
    df = df.append(pd.read_html(page_url, encoding='euc-kr', header='0')[0])

df = df.dropna()  # drop rows with missing values
print(df)
I'm getting this ValueError while crawling the daily stock data from Naver Finance.
I have no trouble getting the URL, but when I use read_html() I get ValueError: Table not found from the line df = df.append(pd.read_html(page_url, encoding='euc-kr', header='0')[0]). Please give me some advice.
I don't read Korean... however, pd.read_html() was getting an error page. I resolved this by making the request with requests.get() and headers, then passing res.text to read_html().
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://finance.naver.com/item/sise_day.nhn?code=068270&page=1"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}

# request the first page with headers so Naver serves the real page
res = requests.get(url, verify=True, headers=headers)

html = BeautifulSoup(res.text, 'lxml')
pgrr = html.find('td', class_='pgRR')
s = str(pgrr.a['href']).split('=')
last_page = s[-1]

df = pd.DataFrame()
sise_url = 'http://finance.naver.com/item/sise_day.nhn?code=068270'

for page in range(1, int(last_page)+1):
    page_url = '{}&page={}'.format(sise_url, page)
    # fetch each page with the same headers and parse the returned HTML text
    res = requests.get(page_url, verify=True, headers=headers)
    df = df.append(pd.read_html(res.text, encoding='euc-kr')[0])
New to screen scraping here and this is my first time posting on Stack Overflow. Apologies in advance for any formatting errors in this post. I am attempting to extract data from multiple pages with URL:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
For instance, page 1 is:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-1
Page 2:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-2
and so on...
My script is running without errors. However, my Pandas exported csv only contains 1 row with the first extracted value. At the time of this posting, the first value is:
14.01 Acres  Vestaburg, Montcalm County, MI$275,000
My intent is to create a spreadsheet with hundreds of rows that pull the property description from the URLs.
Here is my code:
import requests
from requests import get
from bs4 import BeautifulSoup

headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
            }
           )

n_pages = 0
desc = []

for page in range(1, 900):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if house_containers != []:
        for container in house_containers:
            desc = container.getText(strip=True)
    else:
        break

print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))

import pandas as pd
df = pd.DataFrame({'description': [desc]})
df.to_csv('test4.csv', encoding='utf-8')
I suspect the problem is with the line reading desc = container.getText(strip=True) and have tried changing the line but keep getting errors when running.
Any help is appreciated.
I believe the mistake is in the line:
desc = container.getText(strip=True)
Every time it loops, the value in desc is replaced, not added on. To add items into the list, do:
desc.append(container.getText(strip=True))
Also, since it is already a list, you can remove the brackets from the DataFrame creation like so:
df = pd.DataFrame({'description': desc})
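Putting the two changes together, a minimal sketch of the corrected loop (reusing the URL pattern, headers and selector from the question) could look like this:

import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
desc = []

for page in range(1, 900):
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = requests.get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if not house_containers:
        break
    for container in house_containers:
        # append each property description instead of overwriting desc
        desc.append(container.getText(strip=True))

# desc is already a list, so no extra brackets are needed
df = pd.DataFrame({'description': desc})
df.to_csv('test4.csv', encoding='utf-8')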
The cause is that no data is being accumulated inside the loop, so only the final value ends up being saved. For testing purposes the code below only goes up to page 2, so adjust the range back as needed.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
            }
           )

n_pages = 0
desc = []
all_data = pd.DataFrame(index=[], columns=['description'])

for page in range(1, 3):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if house_containers != []:
        for container in house_containers:
            desc = container.getText(strip=True)
            df = pd.DataFrame({'description': [desc]})
            all_data = pd.concat([all_data, df], ignore_index=True)
    else:
        break

all_data.to_csv('test4.csv', encoding='utf-8')
print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))
I'm working on this CSV (https://www.kaggle.com/jtrofe/beer-recipes) and I want to scrape every URL in the data frame, but I can't: if I try with a single URL it works fine, but when I loop over all of the URLs there is a problem and I am not able to scrape them. Can someone help me?
This is my code:
import requests
from bs4 import BeautifulSoup
from time import sleep

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}

base = 'https://www.brewersfriend.com'
links = [f'{base}{r}' for r in df['URL']]

while True:
    try:
        r = requests.get(links, headers=headers, stream=False, timeout=8).text
        break
    except:
        if r.status_code == 404:
            print("Client error")
            r.raise_for_status()
        sleep(1)

soup = BeautifulSoup(r, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})

DEFAULT_VALUE = 'NaN'
if rating is None:
    rating = DEFAULT_VALUE

print(rating.text)
I already know that on some pages there isn't a rating, which is why I created DEFAULT_VALUE with Not a Number, but maybe that is an error too.
Before this code there is the data frame, but I haven't included it here.
I hope someone can help me!
Thanks so much
All kinds of messy things here. I won't go over all of it, but one thing I see is that you are trying to print rating.text. If your rating is 'NaN', one error is that you can't do rating.text.
This is not how I would write this up, but going off your initial coding:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep

df = pd.read_csv('C:/recipeData/recipeData.csv', encoding='ISO-8859-1')

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
base = 'https://www.brewersfriend.com'
links = [f'{base}{r}' for r in df['URL']]

for link in links:
    try:
        r = requests.get(link, headers=headers, stream=False, timeout=8)
        if r.status_code == 404:
            print("Client error")
            r.raise_for_status()
            continue
        else:
            r = r.text
    except:
        continue

    soup = BeautifulSoup(r, 'html5lib')
    rating = soup.find('span', {'itemprop': 'ratingValue'})

    DEFAULT_VALUE = 'NaN'
    if rating is None:
        rating = DEFAULT_VALUE
    else:
        rating = rating.text

    print('%s: %s' % (link, rating))
Here is a way to do the entire process:
import requests, re
import pandas as pd
from bs4 import BeautifulSoup as bs

p = re.compile(r'dataviewToken":"(.*?)"')
p1 = re.compile(r'"rowCount":(\d+)')
results = []
i = 0

with requests.Session() as s:
    r = s.get('https://www.kaggle.com/jtrofe/beer-recipes')
    token = p.findall(r.text)[0]
    rows = int(p1.findall(r.text)[0])
    data = {"jwe":{"encryptedToken": token},"source":{"type":3,"dataset":{"url":"jtrofe/beer-recipes","tableType":1,"csv":{"fileName":"recipeData.csv","delimiter":",","headerRows":1}}},"select":["BeerID","Name","URL","Style","StyleID","Size(L)","OG","FG","ABV","IBU","Color","BoilSize","BoilTime","BoilGravity","Efficiency","MashThickness","SugarScale","BrewMethod","PitchRate","PrimaryTemp"],"skip":0,"take": rows}
    base = 'https://www.brewersfriend.com'
    r = s.post('https://www.kaggleusercontent.com/services/datasets/kaggle.dataview.v1.DataViewer/GetDataView', json=data).json()
    names, links = zip(*[(row['text'][1], base + row['text'][2]) for row in r['dataView']['rows']])

    for link in links:
        r = s.get(link, headers={'User-Agent': 'Mozilla/5.0'})
        if r.status_code == 403:
            rating = 'N/A'
        else:
            soup = bs(r.content, 'lxml')
            rating = soup.select_one('[itemprop=ratingValue]')
            if rating is None:
                rating = 'N/A'
            else:
                rating = rating.text
        row = [names[i], rating]
        results.append(row)
        i += 1

df = pd.DataFrame(results, columns=['Name', 'Rating'])
print(df.head())
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
I want to do web scraping on a Twitter page to download tweets for a specific search word. I am not able to fetch all the tweets recursively; I can only fetch 20 tweets. Please help me fetch all the tweets recursively. Below is the code.
from bs4 import BeautifulSoup
import requests
import pandas as pd
company_name = 'ABC'
url = 'https://twitter.com/search?q=%23%27%20%20%20' + company_name + '&src=typd&lang=en'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
req = requests.get(url, headers=headers);#print(req)
data = req.text;# print(data)
# soup = BeautifulSoup(data, "lxml");# print(soup)
soup = BeautifulSoup(data, "html.parser");# print(soup)
tweets = [p.text for p in soup.findAll('p', class_='tweet-text')]
# print(tweets)
df = pd.DataFrame()
df['Tweet'] = tweets
print(df.head())
print(df.shape)