News site scrape with Python - python

I am trying to scrape some news. I have a larger list of 3k articles from this site, selected by criteria, and (considering I am new to Python) I came out with this script to scrape them:
import pandas as pd
import bs4
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
# get the URL list
list1 = []
a = 'https://www.dnes.bg/sofia/2019/03/13/borisov-se-pohvali-prihodite-ot-gorivata-sa-sys-7-poveche.404467'
b = 'https://www.dnes.bg/obshtestvo/2019/03/13/pazim-ezika-si-pravopis-pod-patronaja-na-radeva.404462'
c = 'https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091'
list1.append(a)
list1.append(b)
list1.append(c)
# define the variables
#url = "https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091"
list2 = list1 #[0:10]
#type(list2)
href = []
title = []
subtitle = []
time = []
article = []
art1 = []
#
#dd = soup.find("div", "art_author").text
#dd
filename = "scraped.csv"
f = open(filename, "w")
#headers = "href;title;subtitle;time;article\n"
headers = "title;subtitle;time;article\n"
f.write(headers)
for url in list2:
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml').decode('windows-1251')
href = url
title = soup.find("h1", "title").string
#title = soup.find("h1", "title").string
#title.extend(soup.find("h1", "title").string) # the title string
subtitle = soup.find("div", "descr").string
#subtitle.extend(soup.find("div", "descr").string) # the subtitle string
time = soup.find("div", "art_author").text
#time.extend(soup.find("div", "art_author").text)
#par = soup.find("div", id="art_start").find_all("p")
art1.extend(soup.find("div", id="art_start").find_all("p"))
for a in art1:
#article.extend(art1.find_all("p"))
article = ([a.text.strip()])
break
#href = "".join(href)
title = "".join(title)
subtitle = "".join(subtitle)
time = "".join(time)
article = "".join(article)
#f.write(href + ";" + title + ";" + subtitle + ";" + time + ";" + article + "\n")
f.write(title + ";" + subtitle + ";" + time + ";" + article +"\n")
f.close()
The main problem for now is that I get an error:
File "<ipython-input-12-9a796b182a82>", line 24, in <module>
title = soup.find("h1", "title").string
TypeError: slice indices must be integers or None or have an __index__ method
I can't really find a solution to this.
And the second problem is whenever I succeed scraping one site, some empty cells occur, which means I have to find a way through Ajax.
I use the Anaconda version 2018.12.

Something I stumbled upon ([here] https://www.youtube.com/watch?v=FSH77vnOGqU ):
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebEnginePage.__init__(self)
self.html = ''
self.loadFinished.connect(self._on_load_finished)
self.load(QUrl(url))
self.app.exec_()
def _on_load_finished(self):
self.html = self.toHtml(self.Callable)
print('Load finished')
def Callable(self, html_str):
self.html = html_str
self.app.quit()
def main():
page = Page('https://pythonprogramming.net/parsememcparseface/')
soup = bs.BeautifulSoup(page.html, 'html.parser')
js_test = soup.find('p', class_='jstest')
print(js_test.text)
if __name__ == '__main__': main()

Ok. I fixed the issue with your soup object stored as a string, so you can use bs4 to parse the html. I also opted to use pandas .to_csv(), as I'm just more familiar with it, but it gets you the desired output:
import pandas as pd
from bs4 import BeautifulSoup
import requests
# get the URL list
list1 = []
a = 'https://www.dnes.bg/sofia/2019/03/13/borisov-se-pohvali-prihodite-ot-gorivata-sa-sys-7-poveche.404467'
b = 'https://www.dnes.bg/obshtestvo/2019/03/13/pazim-ezika-si-pravopis-pod-patronaja-na-radeva.404462'
c = 'https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091'
list1.append(a)
list1.append(b)
list1.append(c)
# define the variables
#url = "https://www.dnes.bg/politika/2019/01/03/politikata-nekanen-gost-na-praznichnata-novogodishna-trapeza.398091"
list2 = list1 #[0:10]
#type(list2)
results = pd.DataFrame()
for url in list2:
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
href = url
title = soup.find("h1", "title").text
#title = soup.find("h1", "title").string
#title.extend(soup.find("h1", "title").string) # the title string
subtitle = soup.find("div", "descr").text
#subtitle.extend(soup.find("div", "descr").string) # the subtitle string
time = soup.find("div", "art_author").text
#time.extend(soup.find("div", "art_author").text)
#par = soup.find("div", id="art_start").find_all("p")
art1 = soup.find("div", id="art_start").find_all("p")
article = []
for a in art1:
if 'googletag.cmd.push' not in a.text:
article.append(a.text.strip())
article = ' '.join(article)
temp_df = pd.DataFrame([[title, subtitle, time, article]], columns = ['title','subtitle','time','article'])
results = results.append(temp_df).reset_index(drop=True)
results.to_csv("scraped.csv", index=False, encoding='utf-8-sig')
Output:
print (results.to_string())
title subtitle time article
0 Борисов се похвали: Приходите от горивата са с... Мерките за изсветляване на сектора действат, к... Обновена: 13 мар 2019 13:24 | 13 мар 2019 11:3... Приходите от горивата са със 7% повече. Това с...
1 "Пазим езика си": Правопис под патронажа на Ра... Грамотността зависи не само от училището, смят... Обновена: 13 мар 2019 11:34 | 13 мар 2019 11:2... За втора поредна година Сдружение "Живата вода...
2 Политиката – "неканен гост" на празничната нов... Основателни ли бяха критиките на президента Ру... 3 яну 2019 10:45, Цветелин Димитров Оказа ли се политиката "неканен гост" на празн...

Related

extract names in custom <h2> but It is extracted many times beautifulsoup

I am trying to extract names in custom <h2>, but the names I want are extracted many times.
how to fix this problem and extract it one time
The page I am pulling data from
here
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
try:
result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
src = result.content
soup = BeautifulSoup(src, "lxml")
page_limit = int("126")
if(page_num > page_limit // 25):
print("page ended, terminate")
break
lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
for i in range(len(lawy_names)) :
lawy_name.append(lawy_names[i].text.strip())
links.append(lawy_names[i].find("a").attrs["href"])
for link in links:
result = requests.get(link)
src = result.content
soup = BeautifulSoup(src, "lxml")
phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
phone.append(phones["href"])
logos = soup.find("div", {"class":"photo-container"})
logo.append(logos.find('img')['src'])
websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
website.append(websites.text.strip())
page_num +=1
print("page switched")
except:
print("error")
break
file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["lawyer name","phone","website","logo"])
wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, as such a dictionary could be used to hold all of your data. Simply skip any entries for which you have already seen the same name. For example:
from bs4 import BeautifulSoup
import requests
import csv
lawyers = {}
page_num = 1
while True:
print(f"Page {page_num}")
req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
soup = BeautifulSoup(req.content, "lxml")
found = False
for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
div_results = soup.find('div', id=id)
if div_results:
for result in div_results.find_all('div', class_='lawyer'):
name = result.h2.get_text(strip=True)
if name not in lawyers:
print(' ', name)
link = result.h2.a['href']
req_details = requests.get(link)
soup_details = BeautifulSoup(req_details.content, "lxml")
a_phone = soup_details.find("a", {"class":"profile-phone-header profile-contact-btn"}, href=True)
if a_phone:
phone = a_phone['href']
else:
phone = None
div_logo = soup_details.find("div", {"class":"photo-container"})
if div_logo.img:
logo = div_logo.img['src']
else:
logo = None
a_website = soup_details.find("a", {"class":"profile-website-header","id":"firm_website"})
if a_website:
website = a_website.get_text(strip=True)
else:
website = None
lawyers[name] = [phone, logo, website]
found = True
# Keep going until no new names found
if found:
page_num += 1
else:
break
with open('Moaaz.csv', 'w', newline='') as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
for name, details in lawyers.items():
csv_output.writerow([name, *details])

how to scrape texts from voetsmart via beautifulsoup

I am trying to scrape some statements made by U.S politicians on votesmart.org
I am experiencing errors in extracting the texts though the code could be run.
The code that I am using is as follow:
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import os
def main():
df=pd.read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
df = df[df.type=='sen']
df = df[~df.votesmart_id.isna()]
done_list = os.listdir('corpus')
print("{} senators".format(len(df)))
df = df[~df.full_name.isin(done_list)]
print("{} after some already done".format(len(df)))
df = df.sample(frac=1)
df.apply(scrape_politician_speeches,axis=1)
def scrape_politician_speeches(row):
print('Scraping {}...'.format(row.full_name))
vs_url='https://justfacts.votesmart.org/candidate/public-statements/{}'.format(int(row.votesmart_id))
vs_page = requests.get(vs_url) # fill in the last part of the url
soup = BeautifulSoup(vs_page.content, features="lxml")
n_pages = 1
page_num = 1
while page_num <= n_pages:
print("\tPage {} of {}".format(page_num,n_pages))
#speeches_url = vs_page.url + '?start=2019-01-01&speechType=14&p={}'.format(page_num)
speeches_url = vs_page.url + '/?s=date&start=2020/01/01&end=&p={}'.format(page_num)
speeches_page = requests.get(speeches_url)
soup = BeautifulSoup(speeches_page.content, features="lxml")
speech_table = soup.find('table', {'id':'statementsObjectsTables'})
speech_table = soup.find('tbody')
speech_links = speech_table.find_all('a',href=True)
speech_hrefs = [a.get('href') for a in speech_links]
for href in speech_hrefs:
scrape_speech(person=row.full_name, speech_url=href)
try:
n_pages = int(soup.find('h7').text.split()[-1])
except:
print("\tNo page numbers")
pass
page_num += 1
sleep(1)
def scrape_speech(person, speech_url):
try:
if not os.path.isdir('corpus/{}'.format(person)):
os.mkdir('corpus/{}'.format(person))
speech_page = requests.get(speech_url)
soup = BeautifulSoup(speech_page.content,features="lxml")
title = soup.find('h3').text
date = soup.find('span',{'itemprop':'datePublished'}).text
location = soup.find('span',{'itemprop':'contentLocation'}).text
body = soup.find('div', {'class':"main clear"})
p_list = body.find_all('p')
text_list = [p.text for p in p_list]
speech_text = '\n\n'.join(text_list)
full_text = '{}\n\n\n{}'.format(title,speech_text)
file_name = '{}, {}, {}.txt'.format(title.split(',')[0], date, location)
file_name = file_name.replace('/',' ')
with open('corpus/{}/{}'.format(person,file_name), 'w') as f:
f.write(full_text)
except:
print("\tError with {}".format(speech_url))
if __name__=='__main__':
main()
The errors are looking like this:
95 senators
95 after some already done
Scraping Tammy Duckworth...
Page 1 of 1
Error with https://votesmart.org/public-statement/1570841/durbin-duckworth-announce-135-million-for-springfield-rail-improvement-project
Error with https://votesmart.org/public-statement/1570825/durbin-duckworth-statement-on-nomination-of-ladon-reynolds-to-serve-as-us-marshal-for-the-northern-district-of-illinois
Error with https://votesmart.org/public-statement/1570826/durbin-duckworth-announce-16-million-in-telehealth-funding-for-illinois-health-care-providers
Thank you so much for your time and attention. I hope to learn more from this wonderful community.
scrape_speech is outdated, probably pages' design changed since script was writen, there's no <div class="main clear"> in html, there's no <span itemprop="datePublished"> and so on. You need to rewrite it using current css selectors.

Python web scraping empty result

I followed a youtube tutorial on web scraping to scrape this website https://books.toscrape.com/ but i'm getting an empty result
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
all_books = []
url = "http://books.toscrape.com/catalogue/page-1.html"
def get_page(url):
page = requests.get(url)
status = page.status_code
soup = bs(page.text, "lxml")
return [soup, status]
def get_links(soup):
links = []
listings = soup.find_all(class_="product_pod")
def get_links(soup):
links = []
listings = soup.find_all(class_="product_pod")
def extract_info(links):
for listing in listings:
bk_lnk = listing.find("h5").a.get("href")
base_url = "http://books.toscrape.com/catalogue"
cmplt_lnk = base_url + bk_lnk
links.append(cmplt_lnk)
return links
def extract_info(links):
for link in links:
res = requests.get(link).text
book_soup = bs(res, "lxml")
title = book_soup.find(class_ = "col-sm-6 product_main").h1. text.strip()
price = book_soup.find(class_ = "col-sm-6 product_main").p. text.strip()
book = {"title": title, "price": price}
all_books.append(book)
pg = 1
while True:
url = f"http://books.toscrape.com/catalogue/page-{pg}.html"
soup_status = get_page(url)
if soup_status[1] == 200:
print (f"scraping page {pg}")
extract_info(get_links(soup_status[0]))
pg += 1
else:
print("The End")
break
df = pd.DataFrame(all_books)
print (df)
here's the result am getting
Empty DataFrame
Columns: []
Index: []
my colab notebook link
https://colab.research.google.com/drive/1Lyvwt_WLpE9tqy1qheZg80N70CFSsk-E?usp=sharing
def get_links(soup):
links = []
listings = soup.find_all(class_="product_pod")
def extract_links():
for listing in listings:
bk_lnk = listing.find("h3").a.get("href")
base_url = "https://books.toscrape.com/catalogue/"
cmplt_lnk = base_url + bk_lnk
links.append(cmplt_lnk)
return links
return extract_links()
def extract_info(links):
for link in links:
res = requests.get(link).text
book_soup = bs(res, "lxml")
title = book_soup.find(class_ = "col-sm-6 product_main").h1.text.strip()
price = book_soup.find(class_ = "col-sm-6 product_main").p.text.strip()
book = {"title": title, "price": price}
all_books.append(book)
pg = 45
while True:
url = f"https://books.toscrape.com/catalogue/page-{pg}.html"
soup_status = get_page(url)
if soup_status[1] == 200:
print (f"scraping page {pg}")
extract_info(get_links(soup_status[0]))
pg += 1
else:
print("The End")
break
Your list is empty . Need to call your functions .. such as
Get_page(url) which should return a list which you can use soup in your subsequent function ..

Iterate Over URLs Using BeautifulSoup

I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')
course_urls = []
for h in soup.findAll('h3'):
a = h.find('a')
try:
if 'href' in a.attrs:
card_url = urljoin(base_url, a.get('href'))
course_urls.append(card_url)
except:
pass
for card_url in course_urls:
print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
date = []
course = []
time = []
runner = []
tips = []
tipsters = []
runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
runner_name = container.h5.a.text
runner.append(runner_name)
tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
tips.append(tips_no)
tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
tipsters.append(tipster_names)
newspaper_tips = pd.DataFrame({
'Runners': runner,
'Tips': tips,
'Tipsters': tipsters,
})
newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')
df = pd.DataFrame(columns=['Runners', 'Tips', 'Tipsters'])
for h in soup_racecard.find_all('h3'):
a = h.find('a', href=True) # only find tags with href present
if a:
url = urljoin(base_url, a['href'])
print(url)
results = requests.get(url)
soup_url = BeautifulSoup(results.text, "html.parser")
for container in soup_url.find_all('div', class_='row-cell-right'):
runner_name = container.h5.a.text
tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
row = [runner_name, tips_no, tipster_names]
df.loc[len(df)] = row # append the new row
df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,

Extract text from <li>

I extracted text from a blog, but in this format:
<li><b><a href="https://www.bloomberg.com/news/articles/article-id">Text</li>
I need to extract only text from it.
I tried this code:
from bs4 import BeautifulSoup
from urllib import request
import nltk, re, pprint
def getAllDoxyDonkeyPosts(url,links):
raw = request.urlopen(url).read()
raw = BeautifulSoup(raw, "lxml")
for a in raw.findAll('a'):
try:
url = a['href']
title = a['title']
if title == "Older Posts":
links.append(url)
getAllDoxyDonkeyPosts(url,links)
except:
title = ""
return
blogUrl = "http://doxydonkey.blogspot.in/"
links = []
getAllDoxyDonkeyPosts(blogUrl,links)
def getDoxyDonkeyText(url):
raw = request.urlopen(url).read()
raw = BeautifulSoup(raw, "lxml")
mydivs = raw.findAll("div", {"class":'post-body'})
posts = []
for div in mydivs:
posts+=div.findAll("li")
return posts
doxyDonkeyPosts = []
for link in links:
doxyDonkeyPosts+=getDoxyDonkeyText(link)
doxyDonkeyPosts
from bs4 import BeautifulSoup
t = '<li><b>Google</b></li>'
b = BeautifulSoup(t, "html.parser")
print b.text #--> Google

Categories