I am using the following code for test purposes, but it only reads the first PDF in the directory. My PDF files are named test, test1 and test2, yet only test is read. The loop prints the full list of PDFs in the directory, but the read part only processes one file. I am not sure whether my line of code with +file_name is correct or should be something else.
I would appreciate some help. Below is my code for reference:
#date
from datetime import*
import PyPDF2
import os
import re
today_date = datetime.today()
print('Today is:' , today_date)
#file list
for file_name in os.listdir(r"C:\\Nikhar\Work\Quantum\Work"):
    print(file_name)
#read all file in directory
load_pdf = open(r"C:\\Nikhar\\Work\\Quantum\\Work\\"+file_name, "rb")
read_pdf = PyPDF2.PdfFileReader(load_pdf)
page_count = read_pdf.getNumPages()
first_page = read_pdf.getPage(0)
page_content = first_page.extractText()
page_content = page_content.replace('\n', '')
print(page_content)
You have to indent the code so that it is executed on each loop iteration, like this:
#date
from datetime import*
import PyPDF2
import os
import re
today_date = datetime.today()
print('Today is:' , today_date)
#file list
for file_name in os.listdir(r"C:\\Nikhar\Work\Quantum\Work"):
    print(file_name)
    #read all file in directory
    load_pdf = open(r"C:\\Nikhar\\Work\\Quantum\\Work\\"+file_name, "rb")
    read_pdf = PyPDF2.PdfFileReader(load_pdf)
    page_count = read_pdf.getNumPages()
    first_page = read_pdf.getPage(0)
    page_content = first_page.extractText()
    page_content = page_content.replace('\n', '')
    print(page_content)
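A minimal alternative sketch of the same loop using pathlib, which builds each file path for you and skips anything that is not a PDF (the directory is the one from the question, and PyPDF2's legacy PdfFileReader API is assumed):

from pathlib import Path
import PyPDF2

# iterate only over the .pdf files in the directory
for pdf_path in Path(r"C:\Nikhar\Work\Quantum\Work").glob("*.pdf"):
    print(pdf_path.name)
    with open(pdf_path, "rb") as load_pdf:
        read_pdf = PyPDF2.PdfFileReader(load_pdf)
        first_page = read_pdf.getPage(0)
        # extract the first page's text and drop the line breaks
        page_content = first_page.extractText().replace('\n', '')
        print(page_content)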
I need to download a PDF from Azure Storage, edit the file (extract a specific page) and render it from a Django view. I have this:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with io.BytesIO() as out:
            pdfWriter.write(out)
But I can't manage to render the PDF from the Django view. I don't want to use open because I had issues in production when doing that.
EDIT1:
This did work for me but NOT in production:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io
from django.http import FileResponse

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('media/test.pdf', 'wb') as f:
            pdfWriter.write(f)
        return FileResponse(open('media/test.pdf', 'rb'), content_type='application/pdf')
EDIT2:
This works, but I had to change the /media path to another one; not sure if it is the best solution yet:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io
from django.http import FileResponse

def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('test/test.pdf', 'wb') as f:
            pdfWriter.write(f)
        return FileResponse(open('test/test.pdf', 'rb'), content_type='application/pdf')
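Both edits still write a temporary file to disk, which is what the question wanted to avoid. A possible sketch of serving the extracted page straight from memory instead, assuming Django's FileResponse and the pdfWriter built as in the snippets above (the helper name render_pdf_in_memory is made up for illustration):

import io
from django.http import FileResponse

def render_pdf_in_memory(pdfWriter):
    # pdfWriter is the PdfFileWriter holding the extracted page
    buffer = io.BytesIO()
    pdfWriter.write(buffer)   # write the one-page PDF into the in-memory buffer
    buffer.seek(0)            # rewind so FileResponse streams from the start
    return FileResponse(buffer, content_type='application/pdf')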
I am writing a Python program that uses a Scrapy spider to crawl a website and look for PDF files. Once a PDF file is found, it downloads the file to a folder, then opens and reads it to grab info from it (author, title, number of pages and whether the document is tagged or not). This data is written to a CSV file.
Everything seems to work fine, but when I open the CSV file there is not nearly as much data in it as I expected. For example, I will crawl a site and find 40 PDF files, but the CSV has entries for only 14 PDFs.
pdfspider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os.path
import validators
import csv
from .. info import isTagged
from .. get_metadata import get_data

class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider'
    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]

    #need domain to name folder pdfs will be put into
    global domain
    domain = allowed_domains[0]

    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]

    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        base_url = start
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?']
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'x')
        writer = csv.writer(f)
        writer.writerow(header)
        f.close()

    def save_pdf(self, response):
        url = response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            isExist = os.path.exists(save_dir)
            if not isExist:
                # Create a new directory because it does not exist
                os.makedirs(save_dir)
            csvFile = domain + '.csv'
            csvPath = save_dir + '/' + csvFile
            csvPathExist = os.path.exists(csvPath)
            if not csvPathExist:
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            with open(full_path, 'wb') as f:
                f.write(response.body)
            metaData = get_data(full_path)
            is_tagged = isTagged(full_path)
            row = [url, metaData[0], metaData[1], metaData[2], is_tagged]
            self.add_to_csv(row)
        else:
            print(f"Failed to load pdf: {url}")

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, "a")
        writer = csv.writer(f)
        writer.writerow(row)
        f.close()
info.py
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.pdfdevice import TagExtractor
from pdfminer3.pdfpage import PDFPage
from io import BytesIO

def isTagged(path, password=''):
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    try:
        try:
            device = TagExtractor(rsrcmgr, retstr, codec='utf-8')
        except:
            print('Not utf-8.')
            try:
                device = TagExtractor(rsrcmgr, retstr, codec='ascii')
            except:
                print('Not ascii.')
    except Exception as ex:
        print(ex)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    maxpages = 100
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    contents = retstr.getvalue().decode()
    fp.close()
    device.close()
    retstr.close()
    # check if common proprietary Acrobat tags are in the response
    tags = ["<b\'Part\'", "</b\'Sect\'", "</b\'Art\'", "<b'Content'", "<b\'Artifact\'"]
    for tag in tags:
        if tag in contents:
            return 'tagged'
            break
        else:
            continue
    return 'not tagged'
get_metadata.py
from pdfminer3.pdfparser import PDFParser
from pdfminer3.pdfdocument import PDFDocument
from pdfminer.high_level import extract_pages

def get_data(file):
    fp = open(file, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    author = doc.info[0]["Author"]
    title = doc.info[0]["Title"]
    numPages = len(list(extract_pages(file)))
    return title, author, numPages
Can anyone see what I am doing wrong? This is my first Python program, so I am pretty green. I am running Python 3.9.7.
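Only a guess from reading the code, not a confirmed diagnosis: in get_data(), doc.info[0]["Author"] and doc.info[0]["Title"] raise a KeyError for any PDF that has no Author or Title entry, and an unhandled exception inside save_pdf() means no row is written for that file. A defensive sketch of get_data() that falls back to empty strings instead:

from pdfminer3.pdfparser import PDFParser
from pdfminer3.pdfdocument import PDFDocument
from pdfminer.high_level import extract_pages

def get_data(file):
    # fall back to empty strings so a PDF without metadata
    # cannot abort the whole save_pdf() callback
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        info = doc.info[0] if doc.info else {}
        author = info.get("Author", "")
        title = info.get("Title", "")
    numPages = len(list(extract_pages(file)))
    return title, author, numPages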
There is this URL: https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422
I wrote (copy & paste from the internet!) the following code to save all the PDFs inside the table to a folder:
from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin

url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent': 'Mozilla/5.0'}
res = req.urlopen(url)
soup = BeautifulSoup(res, "html.parser")
result = soup.select("a[href]")

link_list = []
for link in result:
    href = link.get("href")
    link_list.append(href)

pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)

abs_pdf_list = []
for relative in pdf_list:
    temp_url = urljoin(url, relative)
    abs_pdf_list.append(temp_url)

filename_list = []
for target in abs_pdf_list:
    temp_list = target.split("/")
    filename_list.append(temp_list[len(temp_list)-1])

newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
    os.makedirs(newpath)

target_dir = "/Users/myfolder/python/IPO/"
savepath_list = []
for filename in filename_list:
    savepath_list.append(os.path.join(target_dir, filename))
savepath_list

for (pdflink, savepath) in zip(abs_pdf_list, savepath_list):
    print(pdflink)
    urllib.request.urlretrieve(pdflink, savepath)
    time.sleep(2)

import pdfplumber
import re

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url) as r:
        with open(local_filename, 'wb') as f:
            f.write(r.content)
    return local_filename

ap_url = abs_pdf_list[0]
ap = download_file(ap_url)

with pdfplumber.open(ap) as pdf:
    page1 = pdf.pages[0]
    page2 = pdf.pages[1]
    text = page1.extract_text()
print(text)
Now I need to read those PDFs and extract the lines below:
From page 1: the line which starts with "Information & Communication"
From page 2: the lines which start with "Book-building Period" and "Offering Price"
and save them in one Excel or CSV file.
Sadly I have reached the limit of my coding skills and can't move any further. I can convert the PDF to text, but …
Please advise me how to do this.
I would recommend installing our new package, pdftextract, which preserves the PDF layout as well as possible when extracting text, and then using some regex to extract the keywords.
Here's a working code snippet tested on 2 PDF files from your link:
import re
import csv
from pdftextract import XPdf

pdf_files = ['a.pdf', "b.pdf"]
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_infos(file:str, keywords:list):
    """extract the text from the pdf file then get the wanted keywords information"""
    # extracting the text from pdf while keeping the original layout
    pdf = XPdf(file)
    txt = pdf.to_text(keep_layout=True)
    row = []
    # getting the keywords information
    for keyword in keywords:
        # search for the keyword
        pattern = "{} (.+)\r".format(keyword)  # extracting the wanted info
        regex = re.compile(pattern, flags=re.I | re.M)
        m = regex.search(txt)
        if m is not None:
            m = m.groups()[0].strip(' /\r')  # strip unwanted space and characters
            row.append(m)
    return row

def main(files:list, fname:str, headers:list):
    """extract the wanted info from a bunch of pdf files and save them as csv file"""
    with open(fname, "w") as wf:
        writer = csv.writer(wf)
        writer.writerow(headers)
        for i, file in enumerate(files, start=1):
            row = extract_infos(file, headers)
            writer.writerow(row)
    print("[DONE]", "wrote {} rows to {}.".format(i, fname))

main(pdf_files, "stocks.csv", keywords)
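If installing an extra package is not an option, the same idea can be sketched with pdfplumber, which the question already uses; the file names and the startswith matching below are assumptions, so real layouts may need tweaking:

import csv
import pdfplumber

keywords = ["Information & Communication", "Book-building Period", "Offering Price"]

def extract_rows(paths):
    for path in paths:
        row = []
        with pdfplumber.open(path) as pdf:
            # join the text of the first two pages, where the wanted lines live
            text = "\n".join((p.extract_text() or "") for p in pdf.pages[:2])
        for keyword in keywords:
            # keep the first line that starts with the keyword, if any
            match = next((line for line in text.splitlines()
                          if line.strip().startswith(keyword)), "")
            row.append(match)
        yield row

with open("stocks.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(keywords)
    writer.writerows(extract_rows(["a.pdf", "b.pdf"]))  # hypothetical file names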
I have written the following code to parse HTML files:
from bs4 import BeautifulSoup
import re
import os
from os.path import join

for (dirname, dirs, files) in os.walk('.'):
    for filename in files:
        if filename.endswith('.html'):
            thefile = os.path.join(dirname, filename)
            with open(thefile, 'r') as f:
                contents = f.read()
                soup = BeautifulSoup(contents, 'lxml')
                Initialtext = soup.get_text()
                MediumText = Initialtext.lower().split()
                clean_tokens = [t for t in text2
                                if re.match(r'[^\W\d]*$', t)]
                removementWords = ['here', 'than']
                FinalResult = set()
                for somewords in range(len(tokensToCheck)):
                    if tokensToCheck[somewords] not in removementWords:
                        FinalResult.add(tokensToCheck[somewords])
I have been struggling with these issues:
1) It saves the results in a separate collection for each file, while I need one collection with the results from all files;
2) As a result, I cannot remove the duplicates that occur across different files.
How can I handle this?
I think I found where you went wrong.
Here's the code, changed a little bit:
from bs4 import BeautifulSoup
import re
import os
from os.path import join

# definition position should be here so that it can collect all results into one.
FinalResult = set()

for (dirname, dirs, files) in os.walk('.'):
    for filename in files:
        if filename.endswith('.html'):
            thefile = os.path.join(dirname, filename)
            with open(thefile, 'r') as f:
                contents = f.read()
                soup = BeautifulSoup(contents, 'lxml')
                Initialtext = soup.get_text()
                MediumText = Initialtext.lower().split()
                clean_tokens = [t for t in text2
                                if re.match(r'[^\W\d]*$', t)]
                removementWords = ['here', 'than']
                # FinalResult = set() - definition position is wrong
                for somewords in range(len(tokensToCheck)):
                    if tokensToCheck[somewords] not in removementWords:
                        FinalResult.add(tokensToCheck[somewords])
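Note that text2 and tokensToCheck are not defined in either snippet; presumably they refer to MediumText and clean_tokens. A minimal consolidated sketch under that assumption, with the set declared once before the walk so duplicates across files collapse automatically:

from bs4 import BeautifulSoup
import re
import os

removementWords = {'here', 'than'}
FinalResult = set()  # one shared set across all files

for dirname, dirs, files in os.walk('.'):
    for filename in files:
        if filename.endswith('.html'):
            thefile = os.path.join(dirname, filename)
            with open(thefile, 'r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            # tokenize the page text and keep alphabetic tokens only
            tokens = soup.get_text().lower().split()
            clean_tokens = [t for t in tokens if re.match(r'[^\W\d]*$', t)]
            # add everything that is not a stop word; the set drops duplicates
            FinalResult.update(t for t in clean_tokens if t not in removementWords)

print(sorted(FinalResult))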
I am new to Python. I am designing a quotes app using Python. I get the quote of the day from the BrainyQuote website using BeautifulSoup and append it to a text file. If the quote of the day has already been added, then when I execute the program again it should be skipped. How can I make this possible?
Here's the code:
from bs4 import BeautifulSoup
import socket
import requests
import subprocess
import datetime

def quotenotify():
    timestamp = datetime.datetime.now().strftime("%b %d")
    res = requests.get('https://www.brainyquote.com/quote_of_the_day')
    soup = BeautifulSoup(res.text, 'lxml')
    image_quote = soup.find('img', {'class': 'p-qotd bqPhotoDefault bqPhotoDefaultFw img-responsive delayedPhotoLoad'})
    quoteday = image_quote['alt']
    text_file = open("quotes.log", "a+")
    text_file.write("%s"%timestamp+"\t"+"%s"% quoteday)
    text_file.write("\n")
    text_file.close()
    return

quotenotify()
output in a file:
Mar 29 Where there is a great love, there are always wishes. - Willa Cather
Mar 29 Where there is great love, there are always wishes. - Willa Cather
Continuing from the comments:
from bs4 import BeautifulSoup
import requests
import datetime

def quotenotify():
    timestamp = datetime.datetime.now().strftime("%b %d")
    res = requests.get('https://www.brainyquote.com/quote_of_the_day')
    soup = BeautifulSoup(res.text, 'lxml')
    image_quote = soup.find('img', {'class': 'p-qotd bqPhotoDefault bqPhotoDefaultFw img-responsive delayedPhotoLoad'})['alt']
    with open("quotes.log", "w+") as f:
        if image_quote not in f.read():
            f.write("%s"%timestamp+"\t"+"%s"% image_quote + "\n")

quotenotify()
EDIT:
Since using the mode w+ would truncate the file, I'd suggest going with pathlib:
from bs4 import BeautifulSoup
import requests
import datetime
from pathlib import Path

def quotenotify():
    timestamp = datetime.datetime.now().strftime("%b %d")
    res = requests.get('https://www.brainyquote.com/quote_of_the_day')
    soup = BeautifulSoup(res.text, 'lxml')
    image_quote = timestamp + "\t" + soup.find('img', {'class': 'p-qotd bqPhotoDefault bqPhotoDefaultFw img-responsive delayedPhotoLoad'})['alt']
    with open("quotes3.log", "a+") as f:
        # the a+ open above creates the file if needed, so read_text() is safe
        contents = Path("quotes3.log").read_text()
        print(contents)
        print(image_quote)
        if image_quote not in contents:
            f.write(image_quote + "\n")

quotenotify()
As mentioned by @DirtyBit, you should open the file in read mode first and load the content into a variable.
You can see in my example below that I load the content into a variable, then append the new string to the file only if it is not already in the text file.
text_file = open('test-file.txt', 'r+')
read_the_file = text_file.read()
text_file.close()

text_file = open('test-file.txt', 'a+')
new_string = 'Smack Alpha learns python'
if new_string not in read_the_file:
    text_file.write(new_string + '\n')
text_file.close()
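Putting the same read-then-append idea back into the quote scraper, a minimal sketch (the selector is the one from the question; the log file name is an assumption):

from bs4 import BeautifulSoup
import requests
import datetime
import os

def quotenotify(logfile="quotes.log"):
    timestamp = datetime.datetime.now().strftime("%b %d")
    res = requests.get('https://www.brainyquote.com/quote_of_the_day')
    soup = BeautifulSoup(res.text, 'lxml')
    quote = soup.find('img', {'class': 'p-qotd bqPhotoDefault bqPhotoDefaultFw img-responsive delayedPhotoLoad'})['alt']

    # read whatever is already logged (empty if the file does not exist yet)
    existing = ""
    if os.path.exists(logfile):
        with open(logfile, 'r') as f:
            existing = f.read()

    # append only if this quote has not been logged before
    if quote not in existing:
        with open(logfile, 'a') as f:
            f.write(timestamp + "\t" + quote + "\n")

quotenotify()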