I am having trouble with the python-docx library. I have scraped images from a website and want to add them to a .docx file, but I cannot add the images directly; I keep getting this error:
File "C:\Python27\lib\site-packages\docx\image\image.py", line 46, in
from_file
with open(path, 'rb') as f: IOError: [Errno 22] invalid mode ('rb') or filename:
'http://upsats.com/Content/Product/img/Product/Thumb/PCB2x8-.jpg'
This is my code:
import urllib
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
import os

document = Document()
document.add_heading("Megatronics Items Full Search", 0)

FullPage = ['New-Arrivals-2017-6', 'Big-Sales-click-here', 'Arduino-Development-boards',
            'Robotics-and-Copters', 'Breakout-Boards', 'RC-Wireless-communication', 'GSM,-GPS,-RFID,-Wifi',
            'Advance-Development-boards-and-starter-Kits', 'Sensors-and-IMU', 'Solenoid-valves,-Relays,--Switches',
            'Motors,-drivers,-wheels', 'Microcontrollers-and-Educational-items', 'Arduino-Shields',
            'Connectivity-Interfaces', 'Power-supplies,-Batteries-and-Chargers', 'Programmers-and-debuggers',
            'LCD,-LED,-Cameras', 'Discrete-components-IC', 'Science-Education-and-DIY', 'Consumer-Electronics-and-tools',
            'Mechanical-parts', '3D-Printing-and-CNC-machines', 'ATS', 'UPS', 'Internal-Battries-UPS',
            'External-Battries-UPS']

urlp1 = "http://www.arduinopak.com/Prd.aspx?Cat_Name="
URL = urlp1 + FullPage[0]

for n in FullPage:
    URL = urlp1 + n
    page = urllib.urlopen(URL)
    bsObj = BeautifulSoup(page, "lxml")
    panel = bsObj.findAll("div", {"class": "panel"})
    for div in panel:
        titleList = div.find('div', attrs={'class': 'panel-heading'})
        imageList = div.find('div', attrs={'class': 'pro-image'})
        descList = div.find('div', attrs={'class': 'pro-desc'})
        r = requests.get("http://upsats.com/", stream=True)
        data = r.text
        for link in imageList.find_all('img'):
            image = link.get("src")
            image_name = os.path.split(image)[1]
            r2 = requests.get(image)
            with open(image_name, "wb") as f:
                f.write(r2.content)
        print(titleList.get_text(separator=u' '))
        print(imageList.get_text(separator=u''))
        print(descList.get_text(separator=u' '))
        document.add_heading("%s \n" % titleList.get_text(separator=u' '))
        document.add_picture(image, width=Inches(1.5))  # this line raises the IOError: `image` is a URL, not a local path
        document.add_paragraph("%s \n" % descList.get_text(separator=u' '))
        document.save('megapy.docx')
That is not all of the code, just the main part. I am having problems with the pictures I downloaded: I want to copy them into the .docx, but I do not know how to add a downloaded picture to the document. Do I need to convert or format the image somehow, and if so, how?
All I know is that the problem lies in this line:
document.add_picture(image, width=Inches(1.0))
How do I make this image show up in docx from the URL? What am I missing?
Update
I did a test with 10 images and got a .docx file. When loading many more images, I hit an error in one place and worked around it by adding a try/except (see below). The resulting megapy.docx came out at 165 MB and took about 10 minutes to create. I changed:
with open(image_name, "wb") as f:
    f.write(r2.content)
To:
image = io.BytesIO(r2.content)
And added:
try:
    document.add_picture(image, width=Inches(1.5))
except:
    pass
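A bare except hides every failure, and embedding the full-resolution downloads is what makes the file so large. One refinement I am considering (untested sketch, assuming Pillow is installed and that python-docx exposes UnrecognizedImageError under docx.image.exceptions): catch only that error and downscale each image before adding it. The 800 px bound and JPEG quality of 70 are arbitrary illustration values.

import io
from PIL import Image  # Pillow (assumed installed)
from docx.image.exceptions import UnrecognizedImageError  # assumed import path

# inside the inner `for link in imageList.find_all('img'):` loop, after r2 = requests.get(image):
try:
    # shrink the scraped image before embedding it so megapy.docx stays small
    img = Image.open(io.BytesIO(r2.content))
    img.thumbnail((800, 800))  # cap the longest side at 800 px (arbitrary)
    buf = io.BytesIO()
    img.convert("RGB").save(buf, format="JPEG", quality=70)  # quality chosen arbitrarily
    buf.seek(0)
    document.add_picture(buf, width=Inches(1.5))
except (UnrecognizedImageError, IOError):
    # skip images python-docx or Pillow cannot read, instead of swallowing every error
    pass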
Use the io library to create a file-like object.
An example that works on both Python 2 and 3:
import requests
import io
from docx import Document
from docx.shared import Inches
url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Usain_Bolt_Rio_100m_final_2016k.jpg/200px-Usain_Bolt_Rio_100m_final_2016k.jpg'
response = requests.get(url, stream=True)
image = io.BytesIO(response.content)
document = Document()
document.add_picture(image, width=Inches(1.25))
document.save('demo.docx')
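For what it's worth, this works because add_picture() accepts either a file path or an open file-like object, so the BytesIO buffer can be passed in directly and the image never has to be written to disk.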
Related
I am writing this code to get information about the top movies and also to download the image belonging to each movie. Some of the images download with a size of 0 bytes (even though they show a size on disk), yet when I click the image link it opens fine and there is nothing wrong with the link.
For example, this is one of the image links:
https://static.stacker.com/s3fs-public/styles/slide_desktop/s3/00000116_4_0.png
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = "https://stacker.com/stories/1587/100-best-movies-all-time"
count = 0
local_description = ""
movie_data = []

data = requests.get(URL).text
soap = BeautifulSoup(data, "html.parser")

titles = soap.find_all(name="h2", class_="ct-slideshow__slide__text-container__caption")[1:]
description = soap.find_all(name="div", class_="ct-slideshow__slide__text-container__description")[1:]
images = soap.find_all(name="img", typeof="foaf:Image")[6:106]

for num in range(100):
    movie_name = titles[num].getText().replace("\n", "")
    local_des = description[num].find_all(name="p")[1:]
    for s in local_des:
        local_description = s.getText().replace(" ", "")
    local_data = {"title": movie_name, "description": local_description}
    movie_data.append(local_data)

    movie_image_link = images[num].get("src")
    response = requests.get(movie_image_link)
    with open(f"images/{movie_name}.png", 'wb') as f:
        f.write(response.content)
    count += 1
    print(count)

data_collected = pd.DataFrame(movie_data)
data_collected.to_csv("Data/100_movie.csv", index=False)
I found my problem: some movie names contain ":", and as you know you cannot use ":" in file names. I fixed the code with .replace():
movie_name = movie_name.replace(":", "")
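If other reserved characters turn up later, a more general cleanup (a sketch using a hypothetical safe_filename helper) could strip everything Windows rejects in file names in one pass:

import re

def safe_filename(name):
    # drop characters Windows does not allow in file names: \ / : * ? " < > |
    return re.sub(r'[\\/:*?"<>|]', "", name).strip()

# inside the download loop, before opening the file:
with open(f"images/{safe_filename(movie_name)}.png", "wb") as f:
    f.write(response.content)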
Once you get a response, check whether it is empty before writing it to disk; if it is, you may need to retry the request, or the link may simply be bad.
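A minimal sketch of that check, reusing the requests calls from the question (the fetch_image helper is hypothetical, and the retry count of 3 and 10-second timeout are arbitrary):

import time
import requests

def fetch_image(url, retries=3):
    # hypothetical helper: return the image bytes, or None if the response stays empty or bad
    for attempt in range(retries):
        response = requests.get(url, timeout=10)
        if response.ok and response.content:  # 2xx status and a non-empty body
            return response.content
        time.sleep(1)  # brief pause before retrying
    return None

content = fetch_image(movie_image_link)
if content:
    with open(f"images/{movie_name}.png", "wb") as f:
        f.write(content)
else:
    print(f"Skipping {movie_image_link}: empty or failed response")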
I am trying to scrape this website. To do so, I wrote the following code, which works nicely:
from bs4 import BeautifulSoup
import pandas as pd
import requests

payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}

req = requests.post(url, headers=headers, data=payload)
soup = BeautifulSoup(req.content, "lxml")

data = []
for card in soup.select('.documentList tbody tr'):
    r = BeautifulSoup(requests.get(f"https://www.bis.org{card.a.get('href')}").content)
    data.append({
        'date': card.select_one('.item_date').get_text(strip=True),
        'title': card.select_one('.title a').get_text(strip=True),
        'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
        'url': f"https://www.bis.org{card.a.get('href')}",
        'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
    })

pd.DataFrame(data)
However, if you open, for example, the first link on the page, it contains a PDF. Whenever there is a PDF in the link, I would like to add the content of that PDF to my dataframe.
To do so, I looked around and tried the following on just the first PDF of the first link:
import io
from PyPDF2 import PdfFileReader

def info(pdf_path):
    response = requests.get(pdf_path)
    with io.BytesIO(response.content) as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

    txt = f"""
    Information about {pdf_path}:
    Author: {information.author}
    Creator: {information.creator}
    Producer: {information.producer}
    Subject: {information.subject}
    Title: {information.title}
    Number of pages: {number_of_pages}
    """
    print(txt)
    return information

info('https://www.bis.org/review/r220708e.pdf')
However, this only returns the document info (which I already have from the previous code); it is missing the text. Ideally, I would like this to be part of the same code as above. I am stuck here.
Can anyone help me with this?
Thanks!
You need to return it, e.g. as a tuple:
return txt, information
If you want the text inside the pdf:
text = ""
for page in pdf.pages:
text += page.extract_text() + "\n"
I'll leave you the pleasure of adapting this to your synchronous, requests-based scraping (really not hard):
from PyPDF2 import PdfReader
...

async def get_full_content(url):
    async with AsyncClient(headers=headers, timeout=60.0, follow_redirects=True) as client:
        if url[-3:] == 'pdf':
            r = await client.get(url)
            with open(f'{url.split("/")[-1]}', 'wb') as f:
                f.write(r.content)
            reader = PdfReader(f'{url.split("/")[-1]}')
            pdf_text = ''
            number_of_pages = len(reader.pages)
            for x in range(number_of_pages):
                page = reader.pages[x]
                text = page.extract_text()
                pdf_text = pdf_text + text
And then you do something with the pdf_text extracted from the .pdf (save it to a database, read it with pandas, run NLP on it with Transformers/torch, etc.).
Edit: one more thing: do a pip install -U pypdf2 as the package was recently updated (a few hours ago), just to make sure you're up to date.
Edit 2: A copy/pastable example, for a single .pdf file:
from PyPDF2 import PdfReader
import requests

url = 'https://www.bis.org/review/r220708e.pdf'
r = requests.get(url)
with open(f'{url.split("/")[-1]}', 'wb') as f:
    f.write(r.content)

reader = PdfReader(f'{url.split("/")[-1]}')
pdf_text = ''
number_of_pages = len(reader.pages)
for x in range(number_of_pages):
    page = reader.pages[x]
    text = page.extract_text()
    pdf_text = pdf_text + text
print(pdf_text)
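To fold this into the scraping loop from the question, a sketch could pull the first .pdf link out of each speech page and attach its text to the record. The a[href$=".pdf"] selector and the pdf_text_from_page helper below are my assumptions, not something taken from the actual BIS markup:

import io
import requests
from PyPDF2 import PdfReader

def pdf_text_from_page(page_soup, base="https://www.bis.org"):
    # hypothetical helper: find the first .pdf link on a speech page and return its text, or None
    pdf_link = page_soup.select_one('a[href$=".pdf"]')  # assumed selector; adjust to the real markup
    if pdf_link is None:
        return None
    resp = requests.get(base + pdf_link.get('href'))
    reader = PdfReader(io.BytesIO(resp.content))
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# inside `for card in soup.select('.documentList tbody tr'):`, add one more key to the
# dict that gets appended to `data`, e.g.  'pdf_text': pdf_text_from_page(r)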
There is this URL https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422
I wrote (copied and pasted from the internet!) the following code to save all the PDFs that are inside the table to a folder:
from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin

url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent': 'Mozilla/5.0'}
res = req.urlopen(url)
soup = BeautifulSoup(res, "html.parser")
result = soup.select("a[href]")

link_list = []
for link in result:
    href = link.get("href")
    link_list.append(href)

pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)

abs_pdf_list = []
for relative in pdf_list:
    temp_url = urljoin(url, relative)
    abs_pdf_list.append(temp_url)

filename_list = []
for target in abs_pdf_list:
    temp_list = target.split("/")
    filename_list.append(temp_list[len(temp_list) - 1])

newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
    os.makedirs(newpath)

target_dir = "/Users/myfolder/python/IPO/"
savepath_list = []
for filename in filename_list:
    savepath_list.append(os.path.join(target_dir, filename))
savepath_list

for (pdflink, savepath) in zip(abs_pdf_list, savepath_list):
    print(pdflink)
    urllib.request.urlretrieve(pdflink, savepath)
    time.sleep(2)

import pdfplumber
import re

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url) as r:
        with open(local_filename, 'wb') as f:
            f.write(r.content)
    return local_filename

ap_url = abs_pdf_list[0]
ap = download_file(ap_url)

with pdfplumber.open(ap) as pdf:
    page1 = pdf.pages[0]
    page2 = pdf.pages[1]
    text = page1.extract_text()
print(text)
Now I need to read those PDFs, extract the lines below, and save them in one Excel or CSV file:
From page 1: the line that starts with "Information & Communication"
From page 2: the lines that start with "Book-building Period" and "Offering Price"
Sadly, I have reached the limit of my coding skill and can't move any further. I can convert the PDF to text, but ...
Please advise me how to do this.
I would recommend installing our new package, pdftextract, which preserves the PDF layout as well as possible when extracting text, and then using some regex to extract the keywords.
Here's a working code snippet tested on 2 pdf files from your link:
import re
import csv
from pdftextract import XPdf

pdf_files = ['a.pdf', "b.pdf"]
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_infos(file: str, keywords: list):
    """Extract the text from the pdf file, then get the wanted keyword information."""
    # extract the text from the pdf while keeping the original layout
    pdf = XPdf(file)
    txt = pdf.to_text(keep_layout=True)
    row = []
    # get the keyword information
    for keyword in keywords:
        # search for the keyword
        pattern = "{} (.+)\r".format(keyword)  # extracting the wanted info
        regex = re.compile(pattern, flags=re.I | re.M)
        m = regex.search(txt)
        if m is not None:
            m = m.groups()[0].strip(' /\r')  # strip unwanted spaces and characters
            row.append(m)
    return row

def main(files: list, fname: str, headers: list):
    """Extract the wanted info from a bunch of pdf files and save them as a csv file."""
    with open(fname, "w") as wf:
        writer = csv.writer(wf)
        writer.writerow(headers)
        for i, file in enumerate(files, start=1):
            row = extract_infos(file, headers)
            writer.writerow(row)
        print("[DONE]", "wrote {} rows to {}.".format(i, fname))

main(pdf_files, "stocks.csv", keywords)
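If you would rather stay with pdfplumber, which you already import in your question, the same idea can be sketched without the extra package. The startswith matching and the extract_lines helper are my assumptions about the layout, so adjust them if the label and value end up on separate lines:

import csv
import pdfplumber

keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_lines(path, keywords):
    # return, for each keyword, the first line in the first two pages that starts with it (or "")
    lines = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages[:2]:
            lines.extend((page.extract_text() or "").splitlines())
    return [next((ln.strip() for ln in lines if ln.strip().startswith(kw)), "") for kw in keywords]

with open("stocks.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(keywords)
    for path in savepath_list:  # the saved PDF paths built earlier in the question
        writer.writerow(extract_lines(path, keywords))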
Okay, so I am working on a manga (Japanese comics) downloader. The comics are available online, but you can only read them; if you want to download them, you have to save the image files one by one by right-clicking, and so on...
So I am working on an alternative manga downloader that downloads all the chapters you specify and then converts them to a PDF.
I have completed the code for downloading the images and it is working quite well, but the problem is in the PDF-conversion part.
Here's my code:
import requests
import urllib
import glob
from bs4 import BeautifulSoup
import os
from fpdf import FPDF

def download_image(url, path):
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            for chunk in r:
                f.write(chunk)

start_chapter = int(input("Enter Starting Chapter: "))
end_chapter = int(input("Enter Ending Chapter: "))
chapters = range(start_chapter, end_chapter + 1)

chapter_list = []
for chapter in chapters:
    chapter_list.append("https://manganelo.com/chapter/read_one_piece_manga_online_free4/chapter_" + str(chapter))

for URL in chapter_list:
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    images = soup.findAll('img')
    for i in images:
        url = i.attrs["src"]
        os.makedirs(url.split('/')[-2], exist_ok=True)
        download_image(url, os.path.join(url.split('/')[-2], url.split('/')[-1]))

pdf = FPDF()
imageList = glob.glob("*")
for image in imageList:
    pdf.add_page()
    pdf.image(image, 10, 10, 200, 300)
pdf.output("One Piece Chapter", "F")
So, any suggestions on how I can fix this error?
raise RuntimeError('FPDF error: '+msg)
RuntimeError: FPDF error: Unsupported image type: chapter_1_romance_dawn
First of all, this is a very nice idea.
The error occurs because the image list path is wrong: you are storing the JPGs in one folder per chapter (the chapter name). All you have to do is give the correct path to FPDF.
I created a set to avoid duplicates. Then I removed the "images" and "icons" folders -> maybe you will use them?
cchapter = set()
for URL in chapter_list:
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    images = soup.findAll('img')
    for i in images:
        url = i.attrs["src"]
        cchapter.add(url.split('/')[-2])
        os.makedirs(url.split('/')[-2], exist_ok=True)
        download_image(url, os.path.join(url.split('/')[-2], url.split('/')[-1]))

cchapter.remove('images')
cchapter.remove('icons')

chapterlist = list(cchapter)
print(chapterlist[0])

def sortKeyFunc(s):
    return int(os.path.basename(s)[:-4])

for chap in chapterlist:
    pdf = FPDF()
    imageList = glob.glob(chap + "/*.jpg")
    imageList.sort(key=sortKeyFunc)
    for image in imageList:
        pdf.add_page()
        pdf.image(image, 10, 10, 200, 300)
    pdf.output(chap + ".pdf", "F")
Finally, I added a loop to create a PDF for each folder, naming each PDF after its chapter.
You were also missing the extension (".pdf") in your output.
This will work. :)
EDIT:
glob.glob will not return the file list in the correct order.
Reference: here
It is probably not sorted at all and uses the order at which entries
appear in the filesystem, i.e. the one you get when using ls -U. (At
least on my machine this produces the same order as listing glob
matches).
Therefore you can use the filename (in our case a number) as a sort key.
def sortKeyFunc(s):
    return int(os.path.basename(s)[:-4])
Then add imageList.sort(key=sortKeyFunc) in the loop.
NOTE: Code is updated.
I am trying to download some images from NASS Case Viewer. An example of a case is
https://www-nass.nhtsa.dot.gov/nass/cds/CaseForm.aspx?xsl=main.xsl&CaseID=149006692
The link to the image viewer for this case is
https://www-nass.nhtsa.dot.gov/nass/cds/GetBinary.aspx?ImageView&ImageID=497001669&Desc=FRONT&Title=Vehicle+1+-+Front&Version=1&Extend=jpg
which may not be viewable, I assume because of the https. However, this is simply the second image, "Front".
The actual link to the image is (or should be?)
https://www-nass.nhtsa.dot.gov/nass/cds/GetBinary.aspx?Image&ImageID=497001669&CaseID=149006692&Version=1
This will simply download aspx binaries.
My problem is that I do not know how to store these binaries as proper .jpg files.
An example of the code I've tried is:
import requests

test_image = "https://www-nass.nhtsa.dot.gov/nass/cds/GetBinary.aspx?Image&ImageID=497001669&CaseID=149006692&Version=1"
pull_image = requests.get(test_image)
with open("test_image.jpg", "wb+") as myfile:
    myfile.write(str.encode(pull_image.text))
But this does not result in a proper .jpg file. I've also inspected pull_image.raw.read() and saw that it's empty.
What could be the issue here? Are my URLs improper? I've used BeautifulSoup to put these URLs together and reviewed them by inspecting the HTML code from a few pages.
Am I saving the binaries incorrectly?
.text decodes the response content to a string, so your image file will be corrupted. Instead you should use .content, which holds the binary response content.
import requests

test_image = "https://www-nass.nhtsa.dot.gov/nass/cds/GetBinary.aspx?Image&ImageID=497001669&CaseID=149006692&Version=1"
pull_image = requests.get(test_image)
with open("test_image.jpg", "wb+") as myfile:
    myfile.write(pull_image.content)
.raw.read() also returns bytes, but in order to use it you must set the stream parameter to True.
pull_image = requests.get(test_image, stream=True)
with open("test_image.jpg", "wb+") as myfile:
    myfile.write(pull_image.raw.read())
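With stream=True you can also copy the raw stream straight to disk in chunks with shutil.copyfileobj, which avoids holding the whole image in memory; a small sketch (note that .raw does not decode gzip/deflate transfer encodings, so .content is usually the simpler choice):

import shutil
import requests

pull_image = requests.get(test_image, stream=True)
with open("test_image.jpg", "wb") as myfile:
    # copy the undecoded response body to disk in chunks
    shutil.copyfileobj(pull_image.raw, myfile)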
I wanted to follow up on @t.m.adam's answer to provide a complete answer for anyone who is interested in using this data for their own projects.
Here is my code to pull all images for a sample of Case IDs. It's fairly rough code, but I think it gives you what you need to get started.
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

CaseIDs = [149006673, 149006651, 149006672, 149006673, 149006692, 149006693]
url_part1 = 'https://www-nass.nhtsa.dot.gov/nass/cds/'

data = []
with requests.Session() as sesh:
    for caseid in tqdm(CaseIDs):
        url_full = f"https://www-nass.nhtsa.dot.gov/nass/cds/CaseForm.aspx?ViewText&CaseID={caseid}&xsl=textonly.xsl&websrc=true"
        #print(url_full)
        source = sesh.get(url_full).text
        soup = BeautifulSoup(source, 'lxml')
        tr_tags = soup.find_all('tr', style="page-break-after: always")
        for tag in tr_tags:
            #print(tag)
            """
            try:
                vehicle = [x for x in tag.text.split('\n') if 'Vehicle' in x][0]  ## return the first element
            except IndexError:
                vehicle = [x for x in tag.text.split('\n') if 'Scene' in x][0]  ## return the first element
            """
            tag_list = tag.find_all('tr', class_='label')
            test = [x.find('td').text for x in tag_list]
            #print(test)
            img_id, img_type, part_name = test
            img_id = img_id.replace(":", "")
            img = tag.find('img')
            #part_name = img.get('alt').replace(":", "").replace("/", "")
            part_name = part_name.replace(":", "").replace("/", "")
            image_name = " ".join([img_type, part_name, img_id]) + ".jpg"
            url_src = img.get('src')
            img_url = url_part1 + url_src
            print(img_url)
            pull_image = sesh.get(img_url, stream=True)
            with open(image_name, "wb+") as myfile:
                myfile.write(pull_image.content)