Web scraping PDFs from multiple links in Python - python

I am trying to webscrape this website. To do so, I wrote the following code which works nicely:
from bs4 import BeautifulSoup
import pandas as pd
import requests

payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}

req = requests.post(url, headers=headers, data=payload)
soup = BeautifulSoup(req.content, "lxml")

data = []
for card in soup.select('.documentList tbody tr'):
    r = BeautifulSoup(requests.get(f"https://www.bis.org{card.a.get('href')}").content)
    data.append({
        'date': card.select_one('.item_date').get_text(strip=True),
        'title': card.select_one('.title a').get_text(strip=True),
        'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
        'url': f"https://www.bis.org{card.a.get('href')}",
        'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
    })

pd.DataFrame(data)
However, if you open, for example, the first link on the page, it contains a PDF. Whenever a link contains a PDF, I would like to add the content of that PDF to my dataframe.
To do so, I have been looking around and I tried the following only on the first pdf of the first link:
import io
from PyPDF2 import PdfFileReader

def info(pdf_path):
    response = requests.get(pdf_path)
    with io.BytesIO(response.content) as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

    txt = f"""
    Information about {pdf_path}:

    Author: {information.author}
    Creator: {information.creator}
    Producer: {information.producer}
    Subject: {information.subject}
    Title: {information.title}
    Number of pages: {number_of_pages}
    """

    print(txt)
    return information
info('https://www.bis.org/review/r220708e.pdf')
However, it only gets the document info (which I already have from the previous code); it is missing the text. Ideally, I would like it to be part of the same code as above. I got stuck here.
Can anyone help me with this?
Thanks!

You need to return it, e.g. as a tuple:
    return txt, information
If you want the text inside the pdf:
    text = ""
    for page in pdf.pages:
        text += page.extract_text() + "\n"
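Putting the two together, a minimal sketch of an info() function that also returns the extracted text could look like this (it assumes an up-to-date PyPDF2, where PdfReader, .metadata and extract_text() are available):

import io
import requests
from PyPDF2 import PdfReader

def info(pdf_path):
    # download the PDF and read it from memory
    response = requests.get(pdf_path)
    with io.BytesIO(response.content) as f:
        pdf = PdfReader(f)
        information = pdf.metadata
        number_of_pages = len(pdf.pages)
        # concatenate the text of every page
        text = ""
        for page in pdf.pages:
            text += (page.extract_text() or "") + "\n"
    return information, number_of_pages, text

information, number_of_pages, text = info('https://www.bis.org/review/r220708e.pdf')
print(text[:500])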

I'll allow you the pleasure of adapting this to your requests, sync scraping fashion (really not hard):
from PyPDF2 import PdfReader
...

async def get_full_content(url):
    async with AsyncClient(headers=headers, timeout=60.0, follow_redirects=True) as client:
        if url[-3:] == 'pdf':
            r = await client.get(url)
            with open(f'{url.split("/")[-1]}', 'wb') as f:
                f.write(r.content)
            reader = PdfReader(f'{url.split("/")[-1]}')
            pdf_text = ''
            number_of_pages = len(reader.pages)
            for x in range(number_of_pages):
                page = reader.pages[x]
                text = page.extract_text()
                pdf_text = pdf_text + text
And then you do something with the pdf_text extracted from .pdf (saving it into a db, reading it with pandas, nlp-ing it with Transformers/torch, etc).
Edit: one more thing: do a pip install -U pypdf2 as the package was recently updated (a few hours ago), just to make sure you're up to date.
Edit 2: A copy/pastable example, for a single .pdf file:
from PyPDF2 import PdfReader
import requests

url = 'https://www.bis.org/review/r220708e.pdf'
r = requests.get(url)
with open(f'{url.split("/")[-1]}', 'wb') as f:
    f.write(r.content)

reader = PdfReader(f'{url.split("/")[-1]}')
pdf_text = ''
number_of_pages = len(reader.pages)
for x in range(number_of_pages):
    page = reader.pages[x]
    text = page.extract_text()
    pdf_text = pdf_text + text

print(pdf_text)
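To tie this back to the original dataframe, one possible sketch is to try the PDF alongside each speech page inside the same loop. This reuses the soup object from the question's first snippet; the pdf_text_or_none() helper is hypothetical, and the assumption that the PDF sits at the same URL with a .pdf extension (as with https://www.bis.org/review/r220708e.pdf above) is mine, not confirmed for every entry:

import io
import requests
import pandas as pd
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

def pdf_text_or_none(pdf_url):
    # download and extract the PDF text in memory; return None if the URL is not a PDF
    resp = requests.get(pdf_url)
    if resp.status_code != 200 or 'pdf' not in resp.headers.get('content-type', '').lower():
        return None
    reader = PdfReader(io.BytesIO(resp.content))
    return "\n".join((page.extract_text() or "") for page in reader.pages)

data = []
for card in soup.select('.documentList tbody tr'):
    page_url = f"https://www.bis.org{card.a.get('href')}"
    r = BeautifulSoup(requests.get(page_url).content, "lxml")
    data.append({
        'date': card.select_one('.item_date').get_text(strip=True),
        'title': card.select_one('.title a').get_text(strip=True),
        'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
        'url': page_url,
        # assumed URL pattern: the .htm speech page has a matching .pdf next to it
        'pdf_text': pdf_text_or_none(page_url.replace('.htm', '.pdf')),
        'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
    })

pd.DataFrame(data)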

Related

Problem when downloading an image from a website using Beautiful Soup

I am writing this code to get information about the top movies and also download the image belonging to each movie. Some of the images download with a size of 0 bytes (although they show a size on disk). When I click the link of an image that I can't download properly, it opens fine, so there is no problem with the link.
For example, this is one of the image links:
https://static.stacker.com/s3fs-public/styles/slide_desktop/s3/00000116_4_0.png
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = "https://stacker.com/stories/1587/100-best-movies-all-time"
count = 0
local_description = ""
movie_data = []

data = requests.get(URL).text
soap = BeautifulSoup(data, "html.parser")

titles = soap.find_all(name="h2", class_="ct-slideshow__slide__text-container__caption")[1:]
description = soap.find_all(name="div", class_="ct-slideshow__slide__text-container__description")[1:]
images = soap.find_all(name="img", typeof="foaf:Image")[6:106]

for num in range(100):
    movie_name = titles[num].getText().replace("\n", "")
    local_des = description[num].find_all(name="p")[1:]
    for s in local_des:
        local_description = s.getText().replace(" ", "")
    local_data = {"title": movie_name, "description": local_description}
    movie_data.append(local_data)
    movie_image_link = images[num].get("src")
    response = requests.get(movie_image_link)
    with open(f"images/{movie_name}.png", 'wb') as f:
        f.write(response.content)
    count += 1
    print(count)

data_collected = pd.DataFrame(movie_data)
data_collected.to_csv("Data/100_movie.csv", index=False)
I found my problem: some movie names contain ":" and, as you know, you can't use ":" in file names. I fixed the code with .replace():
    movie_name = movie_name.replace(":", "")
Once you get a response, check if it's empty before writing to disk. Might need to retry or the link may be bad.
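A minimal sketch of that check, reusing the requests-based download from the question (the retry count and the filename sanitizing are illustrative choices, not part of the original code):

import time
import requests

def download_image(url, movie_name, retries=3):
    # strip characters that are not allowed in file names
    safe_name = movie_name.replace(":", "").strip()
    for attempt in range(retries):
        response = requests.get(url)
        # only write the file if the response actually contains data
        if response.status_code == 200 and len(response.content) > 0:
            with open(f"images/{safe_name}.png", "wb") as f:
                f.write(response.content)
            return True
        time.sleep(1)  # brief pause before retrying
    print(f"Could not download {url}")
    return False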

Write list values to CSV as they pertain to current iteration

I'm trying to write a list to a CSV file so that it comes out looking like this.
I'm sure I'm not using the csv library correctly, since it prints each character of just the first link to the file. Here's my code:
for t in terms:
    fields = ["Search Term", "URL"]
    url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, "lxml")
    for item in soup.find_all("item"):
        link = str(item)
        i = link.find("<link/>")
        j = link.find("<guid")
        links = link[i+7:j]
    with open("urls.csv", "w") as f:
        write = csv.writer(f)
        write.writerow(fields)
        write.writerows(links)
Any help would be so appreciated. Thanks!!
Use xml parser when creating the soup:
import csv
import requests
from bs4 import BeautifulSoup

terms = ["refrigerator", "kitchen sink"]

with open("urls.csv", "w") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["Search Term", "URL"])
    for t in terms:
        url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
        print(f"Getting {url}")
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, "xml")
        for item in soup.find_all("link"):
            writer.writerow([t, item.get_text(strip=True)])
Creates urls.csv (screenshot from LibreOffice).
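As a side note on why the original attempt wrote single characters: csv.writer.writerows() expects an iterable of rows, and iterating over a plain string yields its characters, so each character ends up as its own row. A small illustration of the difference (the values are made up, just to show the behavior):

import csv, io

buf = io.StringIO()
writer = csv.writer(buf)

link = "https://example.com/article"    # a single URL as a plain string
writer.writerows(link)                   # wrong: every character becomes its own row
writer.writerow(["refrigerator", link])  # right: one row with two columns

print(buf.getvalue())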

How to write code to read output file to figure out how far it got in scraping website and then starting from where it left off

I'm writing a program to scrape article title, date and body text from each article on this website's archive and export to a csv file. The website seems to block me at some point and I get this error: HTTPError: Service Unavailable.
I believe this is because I am trying to access their website too many times in a short amount of time. I want my code to be able to read where the error happened and pick up where it left off.
I've tried adding a delay of 2 seconds after going through 10 articles. I've also tried random delays after every ten articles. I could add longer delays, but I want the code to be able to pick up where it left off, to be foolproof.
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
from time import sleep
from random import randint

csvfile = "C:/Users/k/Dropbox/granularitygrowth/Politico/pol.csv"
with open(csvfile, mode='w', newline='', encoding='utf-8') as pol:
    csvwriter = csv.writer(pol, delimiter='~', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(["Date", "Title", "Article"])

    # for each page on Politico archive
    for p in range(0, 412):
        url = urlopen("https://www.politico.com/newsletters/playbook/archive/%d" % p)
        content = url.read()

        # Parse article links from page
        soup = BeautifulSoup(content, "lxml")
        articleLinks = soup.findAll('article', attrs={'class': 'story-frag format-l'})

        # Each article link on page
        for article in articleLinks:
            link = article.find('a', attrs={'target': '_top'}).get('href')

            # Open and read each article link
            articleURL = urlopen(link)
            articleContent = articleURL.read()

            # Parse body text from article page
            soupArticle = BeautifulSoup(articleContent, "lxml")

            # Limits to div class = story-text tag (where article text is)
            articleText = soupArticle.findAll('div', attrs={'class': 'story-text'})
            for div in articleText:
                # Find date
                footer = div.find('footer', attrs={'class': 'meta'})
                date = footer.find('time').get('datetime')
                print(date)

                # Find title
                headerSection = div.find('header')
                title = headerSection.find('h1').text
                print(title)

                # Find body text
                textContent = ""
                bodyText = div.findAll('p')
                for p_tag in bodyText:
                    textContent += str(p_tag.text) + ' '
                print(textContent)

                # Adds data to csv file
                csvwriter.writerow([date, title, textContent])

            sleep(randint(3, 8))
I expect my code to still have this error but then pick up from where it left off and continue printing and exporting data to csv file.
You can count the number of articles you've saved in the CSV and integer-divide by the articles per page to get the page you were on: page = 1 + records // 10 (the +1 is for the first page).
I've refactored your code like this:
import csv
import time
from random import randint
from urllib.request import urlopen

from bs4 import BeautifulSoup

HEADERS = ["Date", "Title", "Article"]


def count_rows(csv_path: str) -> int:
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        return len(list(reader))


def write_articles(csv_path: str, articles: list):
    # note the append mode, write mode would delete everything and start fresh
    with open(csv_path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f,
                                quoting=csv.QUOTE_MINIMAL,
                                fieldnames=HEADERS)
        writer.writerows(articles)


def init_csv(csv_path: str):
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=HEADERS, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()


def get_page_soup(url: str) -> BeautifulSoup:
    response = urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")
    return soup


def scrape_article(url: str) -> dict:
    article_soup = get_page_soup(url)

    # Limits to div class = story-text tag (where article text is)
    story_el = article_soup.select_one('.story-text')

    # find date
    date = story_el.select_one('.timestamp time')['datetime']

    # find title
    title = story_el.find('h1').text

    # find body text
    article_text = ''
    for p in story_el.find_all('p'):
        article_text += p.text + ' '

    return {
        'Title': title,
        'Date': date,
        'Article': article_text
    }


def main():
    csvfile = "test.csv"
    try:
        record_count = count_rows(csvfile)
    except FileNotFoundError:
        init_csv(csvfile)
        print('Initialized CSV file')
        record_count = 0

    article_per_page = 10
    page = 1 + record_count // article_per_page
    print('Continuing from page', page)

    for p in range(page, 413):
        url = "https://www.politico.com/newsletters/playbook/archive/%d" % p
        soup = get_page_soup(url)
        article_links = soup.select('article.story-frag.format-l')

        # Each article link on page
        articles = []
        for article in article_links:
            link = article.select_one('a[target=_top]')['href']
            scraped_article = scrape_article(link)
            print(scraped_article)
            articles.append(scraped_article)

        write_articles(csvfile, articles)
        print('Finished page', p)
        time.sleep(randint(3, 8))


if __name__ == '__main__':
    main()
this gives you an output like this:
Finished page 48
{'Title': 'Playbook: Scalise takes several Republicans to ...
{'Title': 'Playbook: Four unfolding events that show the ...
{'Title': 'Playbook: Texas kicks off primary season, as D ...
{'Title': 'Playbook: The next gen: McCarthy and Crowley’s ...
{'Title': 'INSIDE THE GRIDIRON DINNER: What Trump said an ...
{'Title': 'DEMS spending millions already to boost vulner ...
{'Title': 'Playbook: Inside the Republican super PAC mone ...
{'Title': 'Playbook: Who would want to be White House com ...
{'Title': "Playbook: Jared Kushner's bad day", 'Date': '2 ...
{'Title': 'Playbook: Gun control quickly stalls in the Se ...
Finished page 49

How to add pictures to docx python from URL?

I am having trouble with the python-docx library. I have scraped images from a website and I want to add them to a docx file, but I cannot add the images to the docx directly; I keep getting this error:
File "C:\Python27\lib\site-packages\docx\image\image.py", line 46, in
from_file
with open(path, 'rb') as f: IOError: [Errno 22] invalid mode ('rb') or filename:
'http://upsats.com/Content/Product/img/Product/Thumb/PCB2x8-.jpg'
This is my code:
import urllib
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
import os

document = Document()
document.add_heading("Megatronics Items Full Search", 0)

FullPage = ['New-Arrivals-2017-6', 'Big-Sales-click-here', 'Arduino-Development-boards',
            'Robotics-and-Copters', 'Breakout-Boards', 'RC-Wireless-communication', 'GSM,-GPS,-RFID,-Wifi',
            'Advance-Development-boards-and-starter-Kits', 'Sensors-and-IMU', 'Solenoid-valves,-Relays,--Switches',
            'Motors,-drivers,-wheels', 'Microcontrollers-and-Educational-items', 'Arduino-Shields',
            'Connectivity-Interfaces', 'Power-supplies,-Batteries-and-Chargers', 'Programmers-and-debuggers',
            'LCD,-LED,-Cameras', 'Discrete-components-IC', 'Science-Education-and-DIY', 'Consumer-Electronics-and-tools',
            'Mechanical-parts', '3D-Printing-and-CNC-machines', 'ATS', 'UPS', 'Internal-Battries-UPS',
            'External-Battries-UPS']

urlp1 = "http://www.arduinopak.com/Prd.aspx?Cat_Name="
URL = urlp1 + FullPage[0]

for n in FullPage:
    URL = urlp1 + n
    page = urllib.urlopen(URL)
    bsObj = BeautifulSoup(page, "lxml")
    panel = bsObj.findAll("div", {"class": "panel"})
    for div in panel:
        titleList = div.find('div', attrs={'class': 'panel-heading'})
        imageList = div.find('div', attrs={'class': 'pro-image'})
        descList = div.find('div', attrs={'class': 'pro-desc'})
        r = requests.get("http://upsats.com/", stream=True)
        data = r.text
        for link in imageList.find_all('img'):
            image = link.get("src")
            image_name = os.path.split(image)[1]
            r2 = requests.get(image)
            with open(image_name, "wb") as f:
                f.write(r2.content)
        print(titleList.get_text(separator=u' '))
        print(imageList.get_text(separator=u''))
        print(descList.get_text(separator=u' '))
        document.add_heading("%s \n" % titleList.get_text(separator=u' '))
        document.add_picture(image, width=Inches(1.5))
        document.add_paragraph("%s \n" % descList.get_text(separator=u' '))

document.save('megapy.docx')
That is not all of it, just the main part. I am having trouble adding the pictures that I downloaded to the docx; I do not know how to add them. Do I need to convert or format the image somehow, and if so, how? All I know is that the problem lies within this code:
    document.add_picture(image, width=Inches(1.0))
How do I make the image from the URL show up in the docx? What am I missing?
Update
I did a test with 10 images and got a docx file. When loading many images I hit an error at one point and worked around it by adding a try/except (see below). The resulting megapy.docx ended up 165 MB and took about 10 minutes to create. I changed:
    with open(image_name, "wb") as f:
        f.write(r2.content)
to:
    image = io.BytesIO(r2.content)
And added:
    try:
        document.add_picture(image, width=Inches(1.5))
    except:
        pass
Use the io library to create file-like objects.
Example that works on both Python 2 and 3:
import requests
import io
from docx import Document
from docx.shared import Inches
url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Usain_Bolt_Rio_100m_final_2016k.jpg/200px-Usain_Bolt_Rio_100m_final_2016k.jpg'
response = requests.get(url, stream=True)
image = io.BytesIO(response.content)
document = Document()
document.add_picture(image, width=Inches(1.25))
document.save('demo.docx')

Get authors name and URL for tag from google scholar

I wish to write to a CSV file a list of all authors, with their URLs, who class themselves under a specific tag on Google Scholar. For example, if we were to take 'security', I would want this output:
author url
Howon Kim https://scholar.google.pl/citations?user=YUoJP-oAAAAJ&hl=pl
Adrian Perrig https://scholar.google.pl/citations?user=n-Oret4AAAAJ&hl=pl
... ...
I have written this code which prints each author's name
# -*- coding: utf-8 -*-
import urllib.request
import csv
from bs4 import BeautifulSoup

url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'lxml')
mydivs = soup.findAll("h3", {"class": "gsc_1usr_name"})

outputFile = open('sample.csv', 'w', newline='')
outputWriter = csv.writer(outputFile)

for each in mydivs:
    for anchor in each.find_all('a'):
        print(anchor.text)
However, this only does it for the first page. Instead, I would like to go through every page. How can I do this?
I'm not writing the code for you, but I'll give you an outline of how you can do it.
Look at the bottom of the page. See the next button? Search for it; the containing div has an id of gsc_authors_bottom_pag, which should be easy to find. I'd do this with selenium: find the next button, click it, wait for the page to load, scrape, and repeat, handling edge cases (out of pages, etc.). A rough sketch follows below.
If the after_author=* bit didn't change in the URL you could just increment the URL start, but unless you want to try to crack that code (unlikely), just click the next button.
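A rough selenium sketch of that outline, assuming a Chrome driver on the PATH; the pagination button is located via the gsc_authors_bottom_pag container mentioned above and the aria-label from the other answer, and the selectors and waits are illustrative rather than tested against the live page:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get("http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security")

while True:
    # scrape the authors currently on the page
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for h3 in soup.find_all("h3", {"class": "gsc_1usr_name"}):
        for anchor in h3.find_all('a'):
            # assumes the anchor href is a relative citations?user=... link
            print(anchor.text, "https://scholar.google.pl" + anchor['href'])

    # find and click the "next" button inside the pagination container
    try:
        next_button = driver.find_element(
            By.CSS_SELECTOR, "#gsc_authors_bottom_pag button[aria-label='Następna']")
    except NoSuchElementException:
        break  # out of pages
    if not next_button.is_enabled():
        break
    next_button.click()
    time.sleep(2)  # crude wait for the next page to load

driver.quit()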
This page uses <button> instead of <a> for the link to the next/previous page.
The button for the next page has aria-label="Następna" ("next" in Polish).
There are two buttons leading to the next page, but you can use either of them.
The button has JavaScript code that redirects to the new page:
window.location=url_to_next_page
but it is plain text, so you can use slicing to get only the URL.
import urllib.request
from bs4 import BeautifulSoup

url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security"

while True:
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'lxml')

    # ... do something on page ...

    # find buttons to next page
    buttons = soup.findAll("button", {"aria-label": "Następna"})

    # exit if no buttons
    if not buttons:
        break

    on_click = buttons[0].get('onclick')
    print('javascript:', on_click)

    # add `domain` and remove `window.location='` and `'` at the end
    url = 'http://scholar.google.pl' + on_click[17:-1]

    # converting some codes to chars
    url = url.encode('utf-8').decode('unicode_escape')

    print('url:', url)
BTW: if you speak Polish then you can visit Python Poland or Python: pierwsze kroki on Facebook.
Since furas has already answered how to loop through all pages, this is a complementary answer to his. The script below scrapes much more than your question asks for and writes the results to a .csv file.
Code and example in online IDE:
from bs4 import BeautifulSoup
import requests, lxml, os, csv

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}


def get_profiles_to_csv():
    html = requests.get('http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:security', headers=headers).text
    soup = BeautifulSoup(html, 'lxml')

    # creating CSV File
    with open('awesome_file.csv', mode='w') as csv_file:
        # defining column names
        fieldnames = ['Author', 'URL']
        # defining .csv writer
        # https://docs.python.org/3/library/csv.html#csv.DictWriter
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # writing (creating) columns
        writer.writeheader()

        # collecting scraped data
        author_data = []

        # Selecting container where all data located
        for result in soup.select('.gs_ai_chpr'):
            name = result.select_one('.gs_ai_name a').text
            link = result.select_one('.gs_ai_name a')['href']

            # https://stackoverflow.com/a/6633693/15164646
            # id = link
            # id_identifer = 'user='
            # before_keyword, keyword, after_keyword = id.partition(id_identifer)
            # author_id = after_keyword
            # affiliations = result.select_one('.gs_ai_aff').text
            # email = result.select_one('.gs_ai_eml').text
            # try:
            #     interests = result.select_one('.gs_ai_one_int').text
            # except:
            #     interests = None
            # "Cited by 107390" = getting text string -> splitting by a space -> ['Cited', 'by', '21180'] and taking [2] index which is the number.
            # cited_by = result.select_one('.gs_ai_cby').text.split(' ')[2]

            # because we have a csv.DictWriter() we converting to the required format
            # dict() keys should be exactly the same as fieldnames, otherwise it will throw an error
            author_data.append({
                'Author': name,
                'URL': f'https://scholar.google.com{link}',
            })

        # iterating over collected author data and writing it to the .csv
        for data in author_data:
            writer.writerow(data)
            # print(f'{name}\nhttps://scholar.google.com{link}\n{author_id}\n{affiliations}\n{email}\n{interests}\n{cited_by}\n')


get_profiles_to_csv()
# output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=pl&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=pl&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=pl&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=pl&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=pl&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=pl&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=pl&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=pl&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=pl&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=pl&user=ty7wIXoAAAAJ
'''
Alternatively, you can do the same thing using Google Scholar Profiles API from SerpApi. It's a paid API with a free plan.
Code to integrate:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import csv, os


def get_profiles_to_csv():
    with open('awesome_serpapi_file_pagination.csv', mode='w') as csv_file:
        fieldnames = ['Author', 'URL']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        params = {
            "api_key": os.getenv("API_KEY"),
            "engine": "google_scholar_profiles",
            "mauthors": "label:security"
        }
        search = GoogleSearch(params)

        while True:
            results = search.get_dict()
            try:
                for result in results['profiles']:
                    name = result['name']
                    link = result['link']
                    writer.writerow({'Author': name, 'URL': link})
            except:
                print('Done')
                break

            # stop when there is no next page to request
            if ('pagination' not in results) or ('next' not in results['pagination']):
                break

            search.params_dict.update(dict(parse_qsl(urlsplit(results["pagination"]["next"]).query)))


get_profiles_to_csv()
# part of the output from created csv:
'''
Author,URL
Johnson Thomas,https://scholar.google.com/citations?hl=en&user=eKLr0EgAAAAJ
Martin Abadi,https://scholar.google.com/citations?hl=en&user=vWTI60AAAAAJ
Adrian Perrig,https://scholar.google.com/citations?hl=en&user=n-Oret4AAAAJ
Vern Paxson,https://scholar.google.com/citations?hl=en&user=HvwPRJ0AAAAJ
Frans Kaashoek,https://scholar.google.com/citations?hl=en&user=YCoLskoAAAAJ
Mihir Bellare,https://scholar.google.com/citations?hl=en&user=2pW1g5IAAAAJ
Matei Zaharia,https://scholar.google.com/citations?hl=en&user=I1EvjZsAAAAJ
John A. Clark,https://scholar.google.com/citations?hl=en&user=xu3n6owAAAAJ
Helen J. Wang,https://scholar.google.com/citations?hl=en&user=qhu-DxwAAAAJ
Zhu Han,https://scholar.google.com/citations?hl=en&user=ty7wIXoAAAAJ
'''
Disclaimer, I work for SerpApi.
