use dcc.Upload with a Word doc in plotly dash - python

I have a number of Word files, transcripts from MS Teams. I have a script that parses them into a pandas DataFrame and breaks them down by speaker, text, and time. I would like to use dcc.Upload to upload, parse, and store these transcripts and then extract some insight from them with an NLP pipeline. The example provided by Plotly, naturally, deals with the csv and xlsx formats. Any ideas on how I should approach this? Any help is much appreciated!
from docx import Document

def get_data_from_word(path_to_file):
    # Open the Word file and collect every paragraph into one string
    with open(path_to_file, "rb") as doc_object:
        doc_reader = Document(doc_object)
        data = ""
        for p in doc_reader.paragraphs:
            data += p.text + "\n"
    return data

def get_csv(paragraphs):
    # Merge consecutive rows from the same speaker into one entry
    combined_paragraphs = []
    speaker_text = []
    for x in range(len(paragraphs)):
        try:
            speaker = paragraphs[x][1]
            next_speaker = paragraphs[x + 1][1]
            if speaker == next_speaker:
                speaker_text.append(paragraphs[x][2])
            else:
                speaker_text.append(paragraphs[x][2])
                text = ''.join(speaker_text)
                combined_paragraphs.append([speaker, text])
                speaker_text = []
        except IndexError:
            # the last row has no "next speaker"
            pass
    return combined_paragraphs
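One way to approach this, as a minimal sketch rather than a full answer: dcc.Upload hands your callback a base64-encoded contents string, so you can decode it and wrap the bytes in io.BytesIO before passing them to python-docx, then feed the result into the parsing functions above. This assumes a recent Dash (2.x); the component ids and the parse_transcript callback name here are made up for illustration.

import base64
import io

from dash import Dash, dcc, html, Input, Output
from docx import Document

app = Dash(__name__)
app.layout = html.Div([
    dcc.Upload(id="upload-transcript", children=html.Button("Upload .docx")),
    html.Pre(id="transcript-text"),
])

@app.callback(
    Output("transcript-text", "children"),
    Input("upload-transcript", "contents"),
    prevent_initial_call=True,
)
def parse_transcript(contents):
    # contents looks like "data:application/vnd...;base64,<encoded bytes>"
    _content_type, content_string = contents.split(",", 1)
    decoded = base64.b64decode(content_string)
    doc = Document(io.BytesIO(decoded))  # python-docx accepts a file-like object
    return "\n".join(p.text for p in doc.paragraphs)

if __name__ == "__main__":
    app.run(debug=True)

From there you could hand the decoded document to get_csv (or your existing parsing script) and store the resulting DataFrame, e.g. in a dcc.Store, before running the NLP pipeline.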

Related

How to extract a keyword and its page number from a PDF file using NLP?

In the above PDF file, my code has to extract keywords and table names like Table 1 and Table 2, and titles in bold letters like INTRODUCTION and CASE PRESENTATION, from all pages of the given PDF.
I wrote a small program to extract the text from the PDF file:
punctuations = ['(',')',';',':','[',']',',','^','=','-','!','.','{','}','/','#','^','&']
stop_words = stopwords.words('english')
keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
print(keywords)
and the output I got was as below
From the above output, how do I extract keywords like INTRODUCTION, CASE PRESENTATION, and Table 1 along with their page numbers and save them in an output file?
Output Format
INTRODUCTION in Page 1
CASE PRESENTATION in Page 3
Table 1 (Descriptive Statistics) in Page 5
I need help obtaining output in this format.
Code
import PyPDF2
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def main():
    fileurl = "Test1.pdf"
    file_name = open(fileurl, "rb")
    readpdf = PyPDF2.PdfFileReader(file_name)

    # Parse through each page to extract the text
    pdfPages = readpdf.numPages
    count = 0
    text = ""

    # The while loop reads each page.
    while count < pdfPages:
        pageObj = readpdf.getPage(count)
        count += 1
        text += pageObj.extractText()

    # This check exists because PyPDF2 cannot read scanned files.
    # If nothing was returned, we run the OCR library textract to
    # convert scanned/image-based PDF files into text.
    if text == "":
        text = textract.process(fileurl, method='tesseract', language='eng').decode('utf-8')

    # PRINT THE TEXT EXTRACTED FROM THE GIVEN PDF
    #print(text)

    # Break the text into individual words
    tokens = word_tokenize(text)
    #print('TOKENS')
    #print(tokens)

    # Clean out the punctuation and stop words we don't need.
    punctuations = ['(',')',';',':','[',']',',','^','=','-','!','.','{','}','/','#','^','&']
    stop_words = stopwords.words('english')
    keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
    print(keywords)
If you want to know which page some text is on, then you shouldn't add everything to one string; you should work with every page separately (in a for-loop).
It could be something similar to this. It is code without tesseract (which would need a way to split the PDF into separate pages) and it works with every page separately:
pdfPages = readpdf.numPages

# create these before the loop
punctuations = ['(',')',';',':','[',']',',','^','=','-','!','.','{','}','/','#','^','&']
stop_words = stopwords.words('english')

#all_pages = []

# work with every page separately
for count in range(pdfPages):
    pageObj = readpdf.getPage(count)
    page_text = pageObj.extractText()
    page_tokens = word_tokenize(page_text)
    page_keywords = [word for word in page_tokens if not word in stop_words and not word in punctuations]
    page_uppercase_words = [word for word in page_keywords if word.isupper()]
    #all_pages.append( (count, page_keywords, page_uppercase_words) )
    print('page:', count)
    print('keywords:', page_keywords)
    print('uppercase:', page_uppercase_words)
    # TODO: append/save page to file
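To get the exact "KEYWORD in Page N" format from the question, the same per-page loop can collect the uppercase words and write one line per keyword to an output file. A minimal sketch (the output filename is made up; readpdf, word_tokenize, stop_words and punctuations are assumed to be defined as above):

with open("keywords_by_page.txt", "w") as out_file:
    for count in range(readpdf.numPages):
        page_text = readpdf.getPage(count).extractText()
        page_tokens = word_tokenize(page_text)
        page_keywords = [w for w in page_tokens if w not in stop_words and w not in punctuations]
        # keep headings written in capitals, e.g. INTRODUCTION, CASE PRESENTATION
        for word in page_keywords:
            if word.isupper():
                # pages are 0-indexed in PyPDF2, so add 1 for the human-readable page number
                out_file.write("{} in Page {}\n".format(word, count + 1))

Detecting entries like "Table 1 (Descriptive Statistics)" would need an extra pattern (for example a regex over the raw page text) on top of this.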
Issue partially resolved here: https://github.com/konfuzio-ai/document-ai-python-sdk/issues/6#issue-876036328
Check: https://github.com/konfuzio-ai/document-ai-python-sdk
# pip install konfuzio_sdk
# in working directory
# konfuzio_sdk init
from konfuzio_sdk.api import get_document_annotations
document_first_annotation = get_document_annotations(document_id=1111)[0]
page_index = document_first_annotation['bbox']['page_index']
keyword = document_first_annotation['offset_string']
The Annotation object in the Konfuzio SDK allows direct access to the keyword string but, at the moment, not to the page index. This attribute will be added soon.
An example to access the first annotation in the first training document of your project would be:
# pip install konfuzio_sdk
# in working directory
# konfuzio_sdk init
from konfuzio_sdk.data import Project
my_project = Project()
annotations_first_doc = my_project.documents[0].annotations()
first_annotation = annotations_first_doc[0]
keyword = first_annotation.offset_string
import PyPDF2
import pandas
import numpy
import re
import os, sys
import nltk
import fitz
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

punctuations = ['(',')',';',':','[',']',',','^','=','-','!','.','{','}','/','#','^','&']
stop_words = stopwords.words('english')

def main():
    file_name = open("File1.pdf", "rb")
    readPDF = PyPDF2.PdfFileReader(file_name)
    call_function(file_name, readPDF)

def call_function(fname, readpdf):
    pdfPages = readpdf.numPages
    for pageno in range(pdfPages):
        # use PyMuPDF (fitz) to read the text of each page separately
        doc_name = fitz.open(fname.name)
        page = word_tokenize(doc_name[pageno].get_text())
        page_texts = [word for word in page if not word in stop_words and not word in punctuations]
        print('Page Number:', pageno)
        print('Page Texts :', page_texts)

How do I read files from one folder and save the results in another folder in Python

This code works, but I have to call all the files one by one; I want to point it at the folder where the files are and save the results in another folder.
I can't figure it out :( Can anybody help me? I'm new to Python. Thank you, I appreciate it :)
import re
import string
import sys

frequency = {}
sys.stdin = open('C:/Users/Desktop/app/data/sources/books/test.txt', 'r')
sys.stdout = open('C:/Users/Desktop/app/data/fre/news/test.txt', 'w')

text_string = sys.stdin.read()
match_pattern = re.findall(r'([-][\w]+)', text_string)

for word in match_pattern:
    count = frequency.get(word, 0)
    frequency[word] = count + 1

frequency_list = frequency.keys()
for word in frequency_list:
    print(word, frequency[word])
Maybe something like this?
import glob
import os
import re

books = glob.glob("C:/Users/Desktop/app/data/sources/books/*.txt")
# now you have a list of all .txt files in that directory.

def writer(text_string, output_file):
    """A function to write out items from an input text string"""
    frequency = {}
    match_pattern = re.findall(r'([-][\w]+)', text_string)
    for word in match_pattern:
        count = frequency.get(word, 0)
        frequency[word] = count + 1
    frequency_list = frequency.keys()
    for word in frequency_list:
        print(word, frequency[word], file=open(output_file, "a"))

# now you have a function that essentially does the procedure you already know works
for book in books:
    book_name = os.path.split(book)[-1]  # get <filename>.txt from the path
    # context manager will close the stream when you're done
    with open(book, "r") as file:
        text_string = file.read()
    output_file = "C:/Users/Desktop/app/data/fre/news/" + book_name
    writer(text_string, output_file)
This code will iterate through the .txt files in the directory you were reading from.
I encapsulated your working code in a function (somewhat reformatted for clarity; you can tell print where to write directly through its file argument), so as you iterate through the files you can read each one in and run it through the code you already know works.
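One design note: the function above re-opens the output file in append mode on every print call, so repeated runs keep appending to whatever is already there. If that matters, a variant of writer (same made-up paths as above) can open the output file once inside a single context manager:

def writer(text_string, output_file):
    """Count hyphen-prefixed words and write one 'word count' line each."""
    frequency = {}
    for word in re.findall(r'([-][\w]+)', text_string):
        frequency[word] = frequency.get(word, 0) + 1
    with open(output_file, "w") as out:  # "w" starts fresh instead of appending
        for word, count in frequency.items():
            print(word, count, file=out)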

How to automate the output for 1200 indexes in a JSON file with Python code

I have a JSON file in my local path for text-to-speech.
The code below gives output for only one index value. How can it be automated with a for loop for 1200 indexes? Also, the output files should be named "speech_1.wav", "speech_2.wav", ... up to "speech_1200.wav", depending on the index value.
Below is my code. Currently I call it for index 1, and it gives me the output "speech_1.wav":
import json
from gtts import gTTS

def open_json(path):
    '''return a list of dictionaries
    '''
    # use the path argument instead of a hard-coded file name
    with open(path, 'r') as file:
        return json.load(file)

data = open_json('C:/Users/name/EN-63.json')
#print(data)

text = data['datasets'][1]['transContent']
print(text)

tts = gTTS(text=text, lang='ta')
tts.save("C:/users/name/speech_1.wav")
print("text converted Successfully")
Please help me to fix this.
A simple for loop would be:
for k, x in enumerate(data['datasets']):
    # k starts at 0, so k + 1 produces speech_1.wav, speech_2.wav, ...
    gTTS(text=x['transContent'], lang='ta').save('C:/users/name/speech_{}.wav'.format(k + 1))
length = len(data['datasets'])
for i in range(length):
    text = data['datasets'][i]['transContent']
    print(text)
    tts = gTTS(text=text, lang='ta')
    tts.save("C:/users/name/speech_{}.wav".format(str(i + 1)))
    print("text converted Successfully")
Hope this is what you want!
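If some entries in data['datasets'] might be missing 'transContent', a slightly more defensive variant of the same loop skips them instead of crashing. This is only a sketch; the output directory is the same made-up path as above:

import os

out_dir = "C:/users/name"
for i, entry in enumerate(data['datasets'], start=1):
    text = entry.get('transContent')
    if not text:
        print("skipping entry", i, "- no transContent")
        continue
    gTTS(text=text, lang='ta').save(os.path.join(out_dir, "speech_{}.wav".format(i)))
    print("saved speech_{}.wav".format(i))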

Get Lines and Paragraphs, not symbols from Google Vision API OCR on PDF

I am attempting to use the now supported PDF/TIFF Document Text Detection from the Google Cloud Vision API. Using their example code I am able to submit a PDF and receive back a JSON object with the extracted text. My issue is that the JSON file that is saved to GCS only contains bounding boxes and text for "symbols", i.e. each character in each word. This makes the JSON object quite unwieldy and very difficult to use. I'd like to be able to get the text and bounding boxes for "LINES", "PARAGRAPHS" and "BLOCKS", but I can't seem to find a way to do it via the AsyncAnnotateFileRequest() method.
The sample code is as follows:
import re

from google.cloud import vision
from google.cloud import storage
from google.protobuf import json_format

def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'
    # How many pages should be grouped into each json output file.
    batch_size = 2
    client = vision.ImageAnnotatorClient()
    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)
    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)
    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=180)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name=bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
Unfortunately when using the DOCUMENT_TEXT_DETECTION type, you can only get the full text per-page, or the individual symbols. It's not too difficult to put together the paragraphs and lines from the symbols though, something like this should work (extending from your example):
breaks = vision.enums.TextAnnotation.DetectedBreak.BreakType
paragraphs = []
lines = []

for page in annotation.pages:
    for block in page.blocks:
        for paragraph in block.paragraphs:
            para = ""
            line = ""
            for word in paragraph.words:
                for symbol in word.symbols:
                    line += symbol.text
                    if symbol.property.detected_break.type == breaks.SPACE:
                        line += ' '
                    if symbol.property.detected_break.type == breaks.EOL_SURE_SPACE:
                        line += ' '
                        lines.append(line)
                        para += line
                        line = ''
                    if symbol.property.detected_break.type == breaks.LINE_BREAK:
                        lines.append(line)
                        para += line
                        line = ''
            paragraphs.append(para)

print(paragraphs)
print(lines)
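If you also need BLOCK-level text (the third granularity asked about), the same nested loops can accumulate the symbols per block instead of per paragraph. A small sketch extending the code above (it reuses the breaks and annotation objects already defined):

blocks = []
for page in annotation.pages:
    for block in page.blocks:
        block_text = ""
        for paragraph in block.paragraphs:
            for word in paragraph.words:
                for symbol in word.symbols:
                    block_text += symbol.text
                    # re-insert the whitespace the API reports as detected breaks
                    if symbol.property.detected_break.type in (
                            breaks.SPACE, breaks.EOL_SURE_SPACE, breaks.LINE_BREAK):
                        block_text += ' '
        blocks.append(block_text)

print(blocks)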

Copying .docx and preserving images

I am trying to copy the elements of a doc from one docx file to another. The text part is easy; the images are where it gets tricky.
Attaching an image to explain the structure of the doc: Just some text and 1 image.
from docx import Document
import io

doc = Document('/Users/neha/Desktop/testing.docx')
new_doc = Document()

for elem in doc.element.body:
    new_doc.element.body.append(elem)

new_doc.save('/Users/neha/Desktop/out.docx')
This gets me the whole structure of the doc in the new_doc but the image is still blank. Image below:
The good thing is I have the blank image in the right place, so I thought of getting the byte-level data from the original image and inserting it into the new doc. Here is how I extended the above code:
from docx import Document
import io

doc = Document('/Users/neha/Desktop/testing.docx')
new_doc = Document()

for elem in doc.element.body:
    new_doc.element.body.append(elem)

im = doc.inline_shapes[0]
blip = im._inline.graphic.graphicData.pic.blipFill.blip
rId = blip.embed
doc_part = doc.part
image_part = doc_part.related_parts[rId]
bytes = image_part._blob  # Here I get the byte level data for the image

im2 = new_doc.inline_shapes[0]
blip2 = im2._inline.graphic.graphicData.pic.blipFill.blip
rId2 = blip2.embed
document_part2 = new_doc.part
document_part2.related_parts[rId2]._blob = bytes

new_doc.save('/Users/neha/Desktop/out.docx')
But the image still shows empty in the new_doc. What should I do from here?
I figured out a solution a couple of days back. The text loses its formatting this way, but the images are correctly placed.
So the idea is: for each paragraph in the source doc, if there is text, I write it to the dest doc. If there is an inline image present, I add a unique identifier at that place in the dest doc (refer here to see how these identifiers and contexts work in docxtpl). These identifiers and docxtpl proved particularly useful here. Then, using those unique identifiers, I create a 'context' (as shown below), which is basically a map from each unique identifier to its particular InlineImage, and finally I render this context.
Below is my code (apologies for the formatting; I copied it directly from my text editor, and shift+tab doesn't work here :P)
from docxtpl import DocxTemplate, InlineImage
from docx import Document
import io
import xml.etree.ElementTree as ET

dest = DocxTemplate()
source = Document(source_path)
context = {}

ims = [im for im in source.inline_shapes]
im_addresses = []
im_streams = []
count = 0

for im in ims:
    blip = im._inline.graphic.graphicData.pic.blipFill.blip
    rId = blip.embed
    doc_part = source.part
    image_part = doc_part.related_parts[rId]
    byte_data = image_part._blob
    image_stream = io.BytesIO(byte_data)
    im_streams.append(image_stream)
    # self.img_path is the output directory on the class this was copied from
    image_name = self.img_path + "img_" + "_" + str(count) + ".jpeg"
    with open(image_name, "wb") as fh:
        fh.write(byte_data)
    im_addresses.append(image_name)
    count += 1

paras = source.paragraphs
im_idx = 0

for para in paras:
    p = dest.add_paragraph()
    r = p.add_run()
    if(para.text):
        r.add_text(para.text)
    root = ET.fromstring(para._p.xml)
    namespace = {'wp': "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"}
    inlines = root.findall('.//wp:inline', namespace)
    if(len(inlines) > 0):
        uid = "img_" + str(im_idx)
        r.add_text("{{ " + uid + " }}")
        context[uid] = InlineImage(dest, im_addresses[im_idx])
        im_idx += 1

try:
    dest.render(context)
except Exception as e:
    print(e)

dest.save(dest_path)
PS: If a paragraph has two images, this code will prove to be sub-optimal. One will have to make some changes to the following:
if(len(inlines) > 0):
    uid = "img_" + str(im_idx)
    r.add_text("{{ " + uid + " }}")
    context[uid] = InlineImage(dest, im_addresses[im_idx])
    im_idx += 1
You will have to add a for loop inside the if statement as well. I didn't need it, since my images were usually big enough that they always came in separate paragraphs. Just a side note for anyone who may need it.
Cheers!
You could try:
Extracting the images from the first document by unzipping the .docx file (per How can I search a word in a Word 2007 .docx file?)
Saving those images to the file system (as foo.png, for instance)
Generating the new .docx file with Python and adding the .png files using document.add_picture('foo.png'), as sketched below.
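A minimal sketch of those three steps (file names and paths are illustrative; a .docx is just a zip archive with its images under word/media/):

import os
import zipfile
from docx import Document

src = "testing.docx"  # source document
extracted = []

# 1. + 2. unzip the .docx and save every image from word/media/ to the file system
with zipfile.ZipFile(src) as z:
    for name in z.namelist():
        if name.startswith("word/media/"):
            out_name = os.path.basename(name)  # e.g. image1.png
            with open(out_name, "wb") as f:
                f.write(z.read(name))
            extracted.append(out_name)

# 3. build the new document and re-insert the pictures
new_doc = Document()
new_doc.add_paragraph("Copied text goes here")  # add whatever text you copied
for image in extracted:
    new_doc.add_picture(image)

new_doc.save("out.docx")

Note that this places the pictures wherever you call add_picture, so it does not preserve their original positions in the text.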
This problem is solved by this package https://docxtpl.readthedocs.io/en/latest/
