How to read from a PDF and save as CSV, using Python?

There is this URL: https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422
I wrote (copied and pasted from the internet!) the following code to save all the PDFs inside the table to a folder:
from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin

url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent': 'Mozilla/5.0'}
# send a browser-style User-Agent with the request
res = req.urlopen(req.Request(url, headers=headers))
soup = BeautifulSoup(res, "html.parser")
result = soup.select("a[href]")
link_list = []
for link in result:
    href = link.get("href")
    link_list.append(href)

# keep only the links that point to PDF files
pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)

# turn the relative links into absolute URLs
abs_pdf_list = []
for relative in pdf_list:
    temp_url = urljoin(url, relative)
    abs_pdf_list.append(temp_url)

# use the last path segment of each URL as the file name
filename_list = []
for target in abs_pdf_list:
    temp_list = target.split("/")
    filename_list.append(temp_list[-1])

newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
    os.makedirs(newpath)

target_dir = "/Users/myfolder/python/IPO/"
savepath_list = []
for filename in filename_list:
    savepath_list.append(os.path.join(target_dir, filename))

# download each PDF, pausing between requests
for (pdflink, savepath) in zip(abs_pdf_list, savepath_list):
    print(pdflink)
    urllib.request.urlretrieve(pdflink, savepath)
    time.sleep(2)

import pdfplumber
import re

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url) as r:
        with open(local_filename, 'wb') as f:
            f.write(r.content)
    return local_filename

ap_url = abs_pdf_list[0]
ap = download_file(ap_url)

with pdfplumber.open(ap) as pdf:
    page1 = pdf.pages[0]
    page2 = pdf.pages[1]
    text = page1.extract_text()
    print(text)
Now I need to read those PDFs and extract the lines below.
From page 1:
the line which starts with "Information & Communication"
From page 2:
the lines which start with
"Book-building Period"
"Offering Price"
and save them in one Excel or CSV file.
Sadly, I have reached the limit of my coding skill and can't move any further. I converted the PDF to text, but …
Please advise me how to do this.
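For reference, here is a minimal sketch of one way to do the extraction step with pdfplumber, which the question already uses. It assumes each keyword and its value end up on the same line of the extracted text (this depends on the PDF layout), and it reuses savepath_list from the download loop above; adjust the keyword list as needed.

import csv
import re
import pdfplumber

keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_row(pdf_path, keywords):
    """Pull the line starting with each keyword out of the first two pages."""
    with pdfplumber.open(pdf_path) as pdf:
        # join the text of pages 1 and 2; the keywords may sit on either page
        text = "\n".join((page.extract_text() or "") for page in pdf.pages[:2])
    row = []
    for keyword in keywords:
        # match a line beginning with the keyword and capture the rest of it
        m = re.search(r"^{}\s*(.+)$".format(re.escape(keyword)), text, flags=re.M)
        row.append(m.group(1).strip() if m else "")
    return row

with open("stocks.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(keywords)
    for savepath in savepath_list:
        writer.writerow(extract_row(savepath, keywords))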

I would recommend installing our new package, pdftextract, which preserves the PDF layout as well as possible when extracting the text, then using some regex to extract the keywords.
Here's a working code snippet, tested on two PDF files from your link:
import re
import csv
from pdftextract import XPdf

pdf_files = ['a.pdf', "b.pdf"]
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_infos(file: str, keywords: list):
    """Extract the text from the pdf file, then pull out the wanted keyword information."""
    # extract the text from the pdf while keeping the original layout
    pdf = XPdf(file)
    txt = pdf.to_text(keep_layout=True)
    row = []
    # get the keyword information
    for keyword in keywords:
        # search for the keyword and capture the rest of its line
        pattern = "{} (.+)\r".format(keyword)
        regex = re.compile(pattern, flags=re.I | re.M)
        m = regex.search(txt)
        if m is not None:
            m = m.groups()[0].strip(' /\r')  # strip unwanted spaces and characters
            row.append(m)
    return row

def main(files: list, fname: str, headers: list):
    """Extract the wanted info from a bunch of pdf files and save it as a csv file."""
    with open(fname, "w") as wf:
        writer = csv.writer(wf)
        writer.writerow(headers)
        for i, file in enumerate(files, start=1):
            row = extract_infos(file, headers)
            writer.writerow(row)
        print("[DONE]", "wrote {} rows to {}.".format(i, fname))

main(pdf_files, "stocks.csv", keywords)

Related

Write changed text to a new PDF

I'm trying to open abc.pdf, find google.com, and replace it with input words. The text changes, but when I write the new text to output.pdf it stays the same as abc.pdf. How can I solve this?
import PyPDF2
import fitz
from PyPDF2 import PdfReader, PdfWriter
import requests

# # Replace with the URL of the PDF you want to download
# pdf_url = input("Enter the URL of the pdf file to download: ")
#
# # Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
#
# # Download the PDF file
# response = requests.get(pdf_url)
# with open("abc.pdf", "wb") as f:
#     f.write(response.content)

with open('abc.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    writer = PyPDF2.PdfWriter()
    # read all pages of the pdf file
    for page in range(len(reader.pages)):
        text = reader.pages[page].extract_text()
        print(text)
        # find the string to search for
        if "google.com" in text:
            # replace the string
            text = text.replace("google.com", new_link)
            # rewrite the pdf file
            print(text)
        writer.add_page(reader.pages[page])
    with open('output.pdf', 'wb') as output:
        writer.write(output)
I also tried with fitz; when I search in the links it has to start with http://, so I couldn't change a link like google.com that way.
import fitz
import requests

# Replace with the URL of the PDF you want to download
pdf_url = input("Enter the URL of the pdf file to download: ")
# Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
old_link = input("Enter the link you want to replace ")

# Download the PDF file
response = requests.get(pdf_url)
with open("file.pdf", "wb") as f:
    f.write(response.content)

# Open the PDF and modify the links
pdf_doc = fitz.open("file.pdf")
for page in pdf_doc:
    for link in page.links():
        print(link)
        if "uri" in link and link["uri"] == old_link:
            print("Found one")
            link["uri"] = new_link

# Save the modified PDF to the desktop
pdf_doc.save("test2.pdf")
pdf_doc.close()
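(Side note: mutating the dict returned by page.links() does not by itself change the PDF; in PyMuPDF the modified dict has to be written back with page.update_link(). A minimal sketch, with hypothetical link values:)

import fitz  # PyMuPDF

doc = fitz.open("file.pdf")
for page in doc:
    for link in page.links():
        if link.get("uri") == "http://old.example.com":  # hypothetical old link
            link["uri"] = "http://new.example.com"       # hypothetical new link
            page.update_link(link)  # write the modified link back into the page
doc.save("test2.pdf")
doc.close()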
And another:
import PyPDF2
import fitz
from PyPDF2 import PdfReader, PdfWriter
import requests

# # Replace with the URL of the PDF you want to download
# pdf_url = input("Enter the URL of the pdf file to download: ")
#
# # Replace with the link you want to replace the original links with
new_link = input("Enter the link you want to replace the original links with: ")
#
# # Download the PDF file
# response = requests.get(pdf_url)
# with open("abc.pdf", "wb") as f:
#     f.write(response.content)

# Open the original PDF file
# with open('abc.pdf', 'rb') as file:
doc = fitz.open('abc.pdf')
print(doc)
p = fitz.Point(50, 72)  # start point of 1st line
for page in doc:
    print(page)
    text = page.get_text()
    text = text.replace("google.com", new_link).encode("utf8")
    rc = page.insert_text(p,  # bottom-left of 1st char
                          text,  # the text (honors '\n')
                          fontname="helv",  # the default font
                          fontsize=11,  # the default font size
                          rotate=0,  # also available: 90, 180, 270
                          )
    # print(text)
    # page.set_text(text)
    # doc.insert_pdf(text, to_page=0)

doc.save("output.pdf")
doc.close()
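For what it's worth: PyPDF2's extract_text returns a copy of the page text, so editing that string never touches the page itself, and fitz's insert_text only stamps new text on top of the existing content. One approach that does replace visible text in PyMuPDF is a redaction annotation, which removes the matched text and can draw replacement text in its place. A minimal sketch, assuming the literal string google.com appears as selectable text (the replacement will not match the original font exactly):

import fitz  # PyMuPDF

new_link = "example.org"  # hypothetical replacement string

doc = fitz.open("abc.pdf")
for page in doc:
    # find every occurrence of the old string on the page
    for rect in page.search_for("google.com"):
        # cover the old text with a redaction carrying the new text
        page.add_redact_annot(rect, text=new_link, fontsize=8)
    page.apply_redactions()
doc.save("output.pdf")
doc.close()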

Download a PDF from a URL, edit it and render it in Django

I need to download a PDF from Azure Storage, edit the file (extract a specific page) and render it from a Django view. I have this:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Keyword I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with io.BytesIO() as out:
            pdfWriter.write(out)
But I can't manage to render the PDF from the Django view. I don't want to use open, because I had issues in production doing that.
EDIT1:
This did work for me but NOT in production:
import urllib.request
import re
import io
from django.http import FileResponse
from PyPDF2 import PdfFileReader, PdfFileWriter

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Keyword I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('media/test.pdf', 'wb') as f:
            pdfWriter.write(f)
        return FileResponse(open('media/test.pdf', 'rb'), content_type='application/pdf')
EDIT2:
This works, but I had to change the /media path to another one; not sure if it is the best solution yet:
import urllib.request
import re
import io
from django.http import FileResponse
from PyPDF2 import PdfFileReader, PdfFileWriter

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Keyword I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('test/test.pdf', 'wb') as f:
            pdfWriter.write(f)
        return FileResponse(open('test/test.pdf', 'rb'), content_type='application/pdf')
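A note on the original attempt: the with io.BytesIO() as out: block was nearly there. FileResponse can serve the buffer directly, with no temporary file, provided the buffer is rewound first and not closed by a with block. A minimal sketch under those assumptions (it keeps the question's keyword search and assumes the keyword is found on some page):

import io
import urllib.request
from django.http import FileResponse
from PyPDF2 import PdfFileReader, PdfFileWriter

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'
        reader = PdfFileReader(io.BytesIO(urllib.request.urlopen(url).read()))
        # find the first page whose text contains the keyword
        page = next(i for i in range(reader.getNumPages())
                    if 'foo' in reader.getPage(i).extractText())
        writer = PdfFileWriter()
        writer.addPage(reader.getPage(page))
        # write into an in-memory buffer and rewind it before handing it to Django;
        # no with-block, since FileResponse closes the buffer itself
        out = io.BytesIO()
        writer.write(out)
        out.seek(0)
        return FileResponse(out, content_type='application/pdf')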

Python - why the print result is repeated and "write to a text file" only has one line

Lovely people! I'm totally new to Python. I tried to scrape several URLs and ran into a problem with print.
I tried to print and write the "shipment status".
I have two URLs, so ideally I get two results.
This is my code:
from bs4 import BeautifulSoup
import re
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    for p in shipment:
        # extract information
        print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())

import sys
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
I have two problems here:
Problem one: I have only two URLs, and when I print the results, every "span" is repeated 4 times (as there are four "span"s).
The result in the "output" is as below:
(I deleted the result example to protect privacy.)
Problem two: I tried to write the "print" to a text file, but only one line appeared in the file:
(I deleted the result example to protect privacy.)
I want to know what is wrong with the code; I want to print results for the two URLs only.
Your help is really appreciated!
Thank you in advance!
The first point is caused by iterating over shipment. Just delete the for loop and correct the indentation of print():
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
The second issue is caused by calling the write outside the loop, and not in append mode. You will end up with this as your loop:
# open file in append mode
with open('somefile.txt', 'a') as f:
    # start iterating your urls
    for url in line_in_list:
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
        # parse something special in the file
        shipment = soup.find_all('span')
        Preparation = shipment[0]
        Sent = shipment[1]
        InTransit = shipment[2]
        Delivered = shipment[3]
        # create output text
        line = f'{url};Preparation{Preparation.getText()};Sent{Sent.getText()};InTransit{InTransit.getText()};Delivered{Delivered.getText()}'
        # print output text
        print(line)
        # append output text to file
        f.write(line + '\n')
And you can delete:

import sys
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
Example of a slightly optimized version of the code:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

file_path = "randomfile.txt"
with open('somefile.txt', 'a', encoding='utf-8') as f:
    for url in line_in_list:
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
        # parse something special in the file
        shipment = list(soup.select_one('#progress').stripped_strings)
        line = f"{url},{';'.join([':'.join(x) for x in list(zip(shipment[::2], shipment[1::2]))])}"
        print(line)
        f.write(line + '\n')
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
There are four spans actually; try this:
for url in line_in_list:
    soup = BeautifulSoup(urlopen(url).read(), 'html')
    # parse something special in the file
    shipments = soup.find_all("span")  # there are four spans actually
    sys.stdout.write('Url ' + url + '; Preparation' + shipments[0].getText() + '; Sent' + shipments[1].getText() + '; InTransit' + shipments[2].getText() + '; Delivered' + shipments[3].getText())
    # change line
    sys.stdout.write("\r")
First question
You have two nested loops:

for url in line_in_list:
    for p in shipment:
        print(...)
The print is nested in the second loop. If you have 4 shipments per url, that will lead to 4 prints per url.
Since you don't use p from for p in shipment, you can completely get rid of the second loop and move the print one indentation level left, like this:
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
Second question
sys.stdout = open(file_path, "w")
print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
Without the file keyword argument, print writes to sys.stdout, which is by default your terminal output. There's only one print after sys.stdout = ... so there will only be one line written to the file.
There's another way to print to a file:

with open('demo.txt', 'a') as f:
    print('Hello world', file=f)
The keyword with will ensure the file is closed even if an exception is raised.
Both combined
From what I understood, you want to print two lines to the file. Here's a solution:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
file_path = "randomfile.txt"

for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html")
    # parse something special in the file
    shipment = soup.find_all("span")
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    with open(file_path, "a") as f:
        # append one line per url (note the trailing newline)
        f.write(
            f"{url} ; Preparation {Preparation.getText()}; Sent {Sent.getText()}; InTransit {InTransit.getText()}; Delivered {Delivered.getText()}\n"
        )

How to generate a word cloud for a group of URLs?

I have an array called "URL" which contains several URLs. Now I want to use a crawler to crawl the title and body of each web page, store them together in a TXT file, and then generate a word cloud for this group of web pages.
This is the first file (urls.py):
def urlsgetword(url):
    from urllib import request
    import os
    from bs4 import BeautifulSoup
    response = request.urlopen(url)  # send the request to open the web page
    content = response.read().decode('utf-8')  # get the page content and decode it as utf-8
    soup = BeautifulSoup(content, 'lxml')
    title = soup.title  # get the page title
    article = soup.find('div', class_='wp_articlecontent')  # get the page body
    title = title.text  # get the title text
    title = ''.join(title.split())  # remove whitespace
    article = article.get_text(strip=True)  # get the document text; strip=True removes leading/trailing blank lines
    article = ''.join(article.split())
    info = title + '\n' + article
    if not os.path.exists("F:/python-file/"):
        os.mkdir("F:/python-file/")
    with open("urls.txt", 'w', encoding='utf-8') as f:
        f.write(info)
        f.close()
This is the second file (wordcloud.py):
def wcloud():
    import matplotlib.pyplot as plt
    import wordcloud
    import jieba
    text = open('F:/python-file/urls.txt').read()
    wordlist_after_jieba = jieba.cut(text, cut_all=True)
    wl_space_split = " ".join(wordlist_after_jieba)
    my_wordcloud = wordcloud.WordCloud().generate(wl_space_split)
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()
Finally, I want the word cloud output in the main file, so I wrote this:

for u in urls:
    urlsgetword(u)
wcloud()
As a result, the program failed. Which file is wrong?
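No answer is recorded here, but two mismatches are visible in the code above: urlsgetword() opens urls.txt in 'w' mode in the current working directory, so each call overwrites the previous page's text, while wcloud() reads F:/python-file/urls.txt, which is never written. A minimal sketch of a corrected write step, assuming the paths stay the same (save_page_text is a hypothetical helper standing in for the last few lines of urlsgetword):

import os

def save_page_text(info: str):
    """Append one page's text to the file that wcloud() later reads."""
    target_dir = "F:/python-file/"
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    # 'a' (append) instead of 'w', and the full path instead of the working
    # directory, so every page accumulates in the one file wcloud() opens
    with open(os.path.join(target_dir, "urls.txt"), 'a', encoding='utf-8') as f:
        f.write(info + '\n')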

How to add pictures to a docx from a URL in Python?

I am having trouble with the python-docx library. I have scraped images from a website and I want to add them to a docx, but I cannot add the images directly; I keep getting an error:
File "C:\Python27\lib\site-packages\docx\image\image.py", line 46, in
from_file
with open(path, 'rb') as f: IOError: [Errno 22] invalid mode ('rb') or filename:
'http://upsats.com/Content/Product/img/Product/Thumb/PCB2x8-.jpg'
This is my code:
import urllib
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
import os

document = Document()
document.add_heading("Megatronics Items Full Search", 0)
FullPage = ['New-Arrivals-2017-6', 'Big-Sales-click-here', 'Arduino-Development-boards',
            'Robotics-and-Copters', 'Breakout-Boards', 'RC-Wireless-communication', 'GSM,-GPS,-RFID,-Wifi',
            'Advance-Development-boards-and-starter-Kits', 'Sensors-and-IMU', 'Solenoid-valves,-Relays,--Switches',
            'Motors,-drivers,-wheels', 'Microcontrollers-and-Educational-items', 'Arduino-Shields',
            'Connectivity-Interfaces', 'Power-supplies,-Batteries-and-Chargers', 'Programmers-and-debuggers',
            'LCD,-LED,-Cameras', 'Discrete-components-IC', 'Science-Education-and-DIY', 'Consumer-Electronics-and-tools',
            'Mechanical-parts', '3D-Printing-and-CNC-machines', 'ATS', 'UPS', 'Internal-Battries-UPS',
            'External-Battries-UPS']
urlp1 = "http://www.arduinopak.com/Prd.aspx?Cat_Name="
URL = urlp1 + FullPage[0]

for n in FullPage:
    URL = urlp1 + n
    page = urllib.urlopen(URL)
    bsObj = BeautifulSoup(page, "lxml")
    panel = bsObj.findAll("div", {"class": "panel"})
    for div in panel:
        titleList = div.find('div', attrs={'class': 'panel-heading'})
        imageList = div.find('div', attrs={'class': 'pro-image'})
        descList = div.find('div', attrs={'class': 'pro-desc'})
        r = requests.get("http://upsats.com/", stream=True)
        data = r.text
        for link in imageList.find_all('img'):
            image = link.get("src")
            image_name = os.path.split(image)[1]
            r2 = requests.get(image)
            with open(image_name, "wb") as f:
                f.write(r2.content)
        print(titleList.get_text(separator=u' '))
        print(imageList.get_text(separator=u''))
        print(descList.get_text(separator=u' '))
        document.add_heading("%s \n" % titleList.get_text(separator=u' '))
        document.add_picture(image, width=Inches(1.5))
        document.add_paragraph("%s \n" % descList.get_text(separator=u' '))

document.save('megapy.docx')
Not all of it, just the main part. Now I am having problems copying the pictures that I downloaded; I want to add them to the docx. I do not know how to add the picture. How do I convert it? I think I have to format it, but how do I do that?
All I know is the problem lies within this code:
document.add_picture(image, width=Inches(1.0))
How do I make this image show up in docx from the URL? What am I missing?
Update
I did a test with 10 images and got a docx. When loading many more, I had an error at one place, and I papered over it by adding a try/except (see below). The resulting megapy.docx got 165 MB big and took about 10 minutes to create. I changed:
with open(image_name, "wb") as f:
    f.write(r2.content)
To:
image = io.BytesIO(r2.content)
And added:
try:
    document.add_picture(image, width=Inches(1.5))
except:
    pass
Use the io library to create file-like objects.
Example that works on Python 2 & 3:
import requests
import io
from docx import Document
from docx.shared import Inches
url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Usain_Bolt_Rio_100m_final_2016k.jpg/200px-Usain_Bolt_Rio_100m_final_2016k.jpg'
response = requests.get(url, stream=True)
image = io.BytesIO(response.content)
document = Document()
document.add_picture(image, width=Inches(1.25))
document.save('demo.docx')
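This works because python-docx's add_picture accepts either a path or a file-like object; wrapping the downloaded bytes in io.BytesIO gives it the stream it expects, whereas the raw URL string is treated as an invalid local filename.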
