Extrating text with textract from S3 bucket

Extrating text with textract from S3 bucket - python

I am trying to retrieve a .doc file from a s3 bucket and use textract to read its text. In order to do so, I created these two functions:
def process_files(filepath):
s3 = s3fs.S3FileSystem()
filename = 's3://' + bucket_name + '/' + filepath
_, ext = os.path.splitext(filename)
if ext == '.pdf':
extract_string = pdf_to_string(s3, filename)
return extract_string
elif ext == '.doc':
extract_string = doc_to_string(s3, filename)
return extract_string
def doc_to_string(s3_file, filename):
"""
convert an .doc or .docs file into string
"""
print(filename)
print(s3_file.ls('/myname/test_files/*'))
text = textract.process(filename)
return text
However, I am getting the error:
Is this the right path/to/file/you/want/to/extract.doc
Therefore I changed my code in order to change the path:
def doc_to_string(s3_file, filename):
"""
convert an .doc or .docs file into string
"""
text = textract.process(s3_file.ls('/myname/test_files/*'))
return text
But I get:
Path should be string bytes or os.pathlike

Related

Get the extension of uploaded file in flask

I'm writing a flask API to extract text from the document. I want to check the extension and if it is pdf I'll give it to the pdf miner else docx2txt.
#app.route('/text-extraction', methods = ['POST'])
def text_extractions():
f = request.files['files']
split_tup = os.path.splitext(f)
file_extension = split_tup[1]
if file_extension == '.pdf':
return extract_text(f)
else:
text = docx2txt.process(f)
if extract_text:
return text.replace('\t', ' ')
return None

Searching excel files for string in file through multiple folder directories not working

I have this code where I am trying to search a Directory and Sub Directories for a specified string within .xls and .xlsx files and return the file name for now. When I run this - I get a return of each file directory path as text for the files ending in .xls and .xlsx and the search string parameter I use under those same returned results. The code is not isolating the files with the string - rather, just returning the file path as text for all results and adding my string parameter to search for under that. What could be happening here? and is it possible to pass a list here and copy the discovered files to a folder? That is where I am trying to get with this in the end. Thank you.
import os
import openpyxl
def findFiles(strings, dir, subDirs, fileContent, fileExtensions):
filesInDir = []
foundFiles = []
filesFound = 0
if not subDirs:
for filename in os.listdir(dir):
if os.path.isfile(os.path.join(dir, filename).replace("\\", "/")):
filesInDir.append(os.path.join(dir, filename).replace("\\", "/"))
else:
for root, subdirs, files in os.walk(dir):
for f in files:
if not os.path.isdir(os.path.join(root, f).replace("\\", "/")):
filesInDir.append(os.path.join(root, f).replace("\\", "/"))
print(filesInDir)
if filesInDir:
for file in filesInDir:
print("Current file: "+file)
filename, extension = os.path.splitext(file)
if fileExtensions:
fileText = extension
else:
fileText = os.path.basename(filename).lower()
if fileContent:
fileText += getFileContent(file).lower()
for string in strings:
print(string)
if string in fileText:
foundFiles.append(file)
filesFound += 1
break
return foundFiles
def getFileContent(filename):
if filename.partition(".")[2] in supportedTypes:
if filename.endswith(".xls"):
content = ""
with openpyxl.load_workbook(filename) as pdf:
for x in range(0, len(pdf.pages)):
page = pdf.pages[x]
content = content + page.extract_text()
return content
elif filename.endswith(".xlsx"):
with openpyxl.load_workbook(filename, 'r') as f:
content = ""
lines = f.readlines()
for x in lines:
content = content + x
f.close()
return content
else:
return ""
supportedTypes = [".xls", ".xlsx"]
print(findFiles(strings=["55413354"], dir="C:/Users/User/", subDirs=True, fileContent=True, fileExtensions=False))
Expected output sample - reflects a find for string '55413354` - as in, that string was located in below file name only out of 3 files.
Excel File Name 123
Actual output - Returns everything - no filter is happening, and includes my search string under the file name.
path/Excel File Name 123
55413354
path/Excel File Name 321
55413354
path/Excel File Name 111
55413354

Extract text from PDF files in a directory using PyPDF2

I would like to extract text from pdf's in a directory to text files in another directory ( convert PDF=> .txt) with PyPDF2
I have read the information here : https://automatetheboringstuff.com/chapter13/
But I did not find information on batch converting the files
import PyPDF2
pdfFileObj = open('meetingminutes.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
pageObj.extractText()
I looking for a solution to convert pdf files from a directory and convert them to .txt files with same names in another directory.

You can take a look at the following code
import os
import PyPDF2
PDFS_FOLDER = '/absolute/path/of/your/pdf/folder'
TEXTS_FOLDER = '/absolute/path/of/your/txt/folder/which/is/already/created'
def get_all_pdfs(folder_path):
"""
:param folder_path: absolute folder path of the pdfs
:return: a list with all the absolute path of pdfs
"""
return os.listdir(folder_path)
def create_absolute_path(root_path, file_name):
"""
:param root_path: absolute route path
:param file_name: file name
:return: absolute path of the file name
"""
root_path = root_path + '/' if root_path[-1] != '/' else root_path
return "%s%s" %(root_path, file_name)
def convert_pdf_to_text(pdf_path):
"""
:param pdf_path:
:return: bytearray with all the pages content
"""
pdfFileObj = open(pdf_path, 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
number_of_pages = pdfReader.getNumPages()
text_byte_array = bytearray()
for page_number in range(number_of_pages):
current_page = pdfReader.getPage(page_number)
page_content = bytearray(current_page.extractText().encode('utf-8'))
text_byte_array.extend(page_content)
return text_byte_array
def convert_pdf_extension_to_text(pdf_file_name):
"""
:param pdf_file_name: string which contains a pdf file name
:return: string with the filename but with .txt extension instead of .pdf
"""
return "%s.txt" %(pdf_file_name.split('.pdf', 1)[0])
def save_text_to_folder(text, target_path):
"""
:param text: byte or bytearray
:param target_path:
:return:
"""
with open(target_path, 'wb') as f:
f.write(text)
if __name__ == '__main__':
all_pdfs = get_all_pdfs(PDFS_FOLDER)
for pdf_file_name in all_pdfs:
abs_path_pdf = create_absolute_path(PDFS_FOLDER, pdf_file_name)
text = convert_pdf_to_text(abs_path_pdf)
text_path = convert_pdf_extension_to_text(pdf_file_name)
target_text_path = create_absolute_path(TEXTS_FOLDER, text_path)
save_text_to_folder(text, target_text_path)

Upload file to S3 folder using python boto

I am trying to upload files from local directory to S3 folder. I am able to upload files to S3 bucket but I am unable to upload files to folder within S3 bucket.
Could any one help? What am i doing wrong here..
Here is the code:
import os
import sys
import boto3
import fnmatch
import pprint
import re
import hashlib
SOURCE_DIR = '/home/user/Downloads/tracks/'
BUCKET_NAME = 'mybucket'
S3_FOLDER = 'mybucket/folder1/'
client = boto3.client('s3')
s3 = boto3.resource('s3')
def get_md5(filename):
f = open(filename, 'rb')
m = hashlib.md5()
while True:
data = f.read(10240)
if len(data) == 0:
break
m.update(data)
return m.hexdigest()
def get_etag(filebase,filepath):
for item in bucket.objects.all():
keyfile = S3_FOLDER + filebase
if(keyfile == item.key):
md5 = get_md5(filepath)
etag = item.e_tag.strip('"').strip("'")
if etag != md5:
print(filebase + ": " + md5 + " != " + etag)
return(files_to_upload.append(filepath))
else:
return(files_to_upload.append(filepath))
files_to_upload = []
for root, dirnames, filenames in os.walk(SOURCE_DIR):
for filename in filenames:
filepath = os.path.join(root, filename)
get_etag(filename,filepath)
for f in files_to_upload:
client.put_object(Bucket=BUCKET_NAME, Key=f)

Folders don't really exist in S3. You can prefix the file name (object key) with the something that looks like a folder path.
It's not entirely clear to me what your code is doing with the file paths, but your code needs to be changed to something like this:
for f in files_to_upload:
key = "my/s3/folder/name/" + f
client.put_object(Bucket=BUCKET_NAME, Key=key, Body=f)
Note: You weren't passing a Body parameter, so I think your code was just creating empty objects in S3.

python unzip files below the root folder

i would like to unzip all the folders and files of an archive below the root folder, i have archive named abc.zip which gives me files as abc/xyz/ abc/123.jpg abc/xyz1/ , i just want to extract xyz/ , 123.jpg and xyz1/ in the CWD
i use below code to extract a file, but would need help on how to omit the root folder of the list
def unzip_artifact( local_directory, file_path ):
fileName, ext = os.path.splitext( file_path )
if ext == ".zip":
Downloadfile = basename(fileName) + ext
print 'unzipping file ' + Downloadfile
try:
zipfile.ZipFile(file_path).extractall(local_directory)
except zipfile.error, e:
print "Bad zipfile: %s" % (e)
return

You have to use a more complex (and therefore more customizable) way to unzip. Instead of using the 'extractall' method, you must extract each files separately with the 'extract' method. Then you will be able to change the destination directory, omitting archive's sub-directories.
Here is your code with the modification you needed :
def unzip_artifact( local_directory, file_path ):
fileName, ext = os.path.splitext( file_path )
if ext == ".zip":
Downloadfile = fileName + ext
print 'unzipping file ' + Downloadfile
try:
#zipfile.ZipFile(file_path).extractall(local_directory) # Old way
# Open the zip
with zipfile.ZipFile(file_path) as zf:
# For each members of the archive
for member in zf.infolist():
# If it's a directory, continue
if member.filename[-1] == '/': continue
# Else write its content to the root
with open(local_directory+'/'+os.path.basename(member.filename), "w") as outfile:
outfile.write(zf.read(member))
except zipfile.error, e:
print "Bad zipfile: %s" % (e)
return

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extrating text with textract from S3 bucket - python

Related

Get the extension of uploaded file in flask

Searching excel files for string in file through multiple folder directories not working

Extract text from PDF files in a directory using PyPDF2

Upload file to S3 folder using python boto

python unzip files below the root folder

Categories

Resources