tokenize error in python - python

I have this sample program, example-extracttext.py from the python-docx library, which extracts text from a docx file.
#!/usr/bin/env python
"""
This file opens a docx (Office 2007) file and dumps the text.
If you need to extract text from documents, use this file as a basis for your
work.
Part of Python's docx module - http://github.com/mikemaccana/python-docx
See LICENSE for licensing information.

Usage: example-extracttext.py <input.docx> <output.txt>
"""
import sys

from docx import opendocx, getdocumenttext

if __name__ == '__main__':
    try:
        document = opendocx(sys.argv[1])
        newfile = open(sys.argv[2], 'w')
    except Exception:
        # Missing arguments or unreadable/unwritable paths all end up here.
        # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
        print(
            "Please supply an input and output file. For example:\n"
            " example-extracttext.py 'My Office 2007 document.docx' 'outp"
            "utfile.txt'"
        )
        # Exit with a non-zero status so a calling shell script can tell
        # that nothing was extracted.
        sys.exit(1)

    try:
        # Fetch all the text out of the document we just opened
        paratextlist = getdocumenttext(document)

        # Encode every paragraph as UTF-8 before writing.
        # NOTE(review): joining the encoded paragraphs with a plain str
        # separator relies on Python 2 string semantics (the traceback in
        # the question shows python2.7).
        newparatextlist = [
            paratext.encode("utf-8") for paratext in paratextlist
        ]

        # Write the text of the document with two newlines between paragraphs
        newfile.write('\n\n'.join(newparatextlist))
    finally:
        # Always flush and release the output file, even if extraction fails.
        newfile.close()
It runs fine on its own, but it fails when I put another program called tokenize.py (given just below) in the same directory.
# Split the text file named on the command line into sentences with NLTK's
# pre-trained English Punkt tokenizer and print them separated by "-----".
#
# NOTE(review): do not save this script as tokenize.py. A file with that name
# sitting next to other scripts is imported in place of the standard library's
# own tokenize module (which inspect imports), so pick a distinct name such
# as sentence_split.py.
import sys

import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Read the whole input and close the file before tokenizing.
with open(sys.argv[1], "r") as fo:
    data = fo.read()

# print(...) with a single argument behaves the same on Python 2 and 3.
print('\n-----\n'.join(tokenizer.tokenize(data)))
It gives the following error.
Traceback (most recent call last):
File "./example-extracttext.py", line 14, in <module>
from docx import opendocx, getdocumenttext
File "/usr/local/lib/python2.7/dist-packages/docx-0.2.1-py2.7.egg/docx.py", line 12, in <module>
from lxml import etree
File "parsertarget.pxi", line 4, in init lxml.etree (src/lxml/lxml.etree.c:178742)
File "/usr/lib/python2.7/inspect.py", line 39, in <module>
import tokenize
File "/home/sriram/NLP_TOOLS/EDITING_TOOL/NLP/sriram_work/tokenize.py", line 3, in <module>
import nltk.data
File "/usr/local/lib/python2.7/dist-packages/nltk/__init__.py", line 106, in <module>
from decorators import decorator, memoize
File "/usr/local/lib/python2.7/dist-packages/nltk/decorators.py", line 176, in <module>
#decorator
File "/usr/local/lib/python2.7/dist-packages/nltk/decorators.py", line 154, in decorator
if inspect.isclass(caller):
AttributeError: 'module' object has no attribute 'isclass'
Please tell me how to resolve this. I want to use both the programs in a single shell script.

Related

Error occurring while using win32 in VS Code (in Python)

I am trying to convert text to speech in Python 3.10.2 using the code:
import win32com.client
speaker = win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
But the following error keeps occurring:
Traceback (most recent call last): File "d:\Program\Python programing\tempCodeRunnerFile.py", line 1, in <module>
import win32com.client File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\__init__.py",
line 10, in <module>
from . import dynamic File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\dynamic.py",
line 24, in <module>
from . import build File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\build.py",
line 638, in <module>
valid_identifier_chars = string.ascii_letters + string.digits + "_" AttributeError: module 'string' has no attribute 'ascii_letters'
you wrote two lines of code in the same line
Try this
import win32com.client
speaker = win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
or try to put a semicolon after the first line of code
Try this
import win32com.client; speaker =win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
either way should work

Transform docx to html raises python MemoryError

I have a function that converts a docx to html and a large docx file to be converted.
The problem is this function is part of a bigger program and the converted html is parsed afterwards so I cannot afford to use another converter without impacting the rest of the code (which is not wanted). Running on python 2.7.13 installed on 32-bit, but changing to 64-bit is also not desired.
This is the function:
import logging

import ooxml
from ooxml import serialize


def trasnformDocxtoHtml(inputFile, outputFile):
    """Convert the docx file at ``inputFile`` to HTML saved at ``outputFile``.

    Logging is configured to write INFO-level records to ``ooxml.log``.
    The document is parsed with ``ooxml.read_from_file`` and the result of
    ``serialize.serialize`` on its ``document`` attribute is written out.

    Args:
        inputFile: Path of the .docx file to read.
        outputFile: Path of the HTML file to create (overwritten if present).

    Raises:
        MemoryError: Propagated from the parser when the document is too
            large to fit in the process's memory.
    """
    logging.basicConfig(filename='ooxml.log', level=logging.INFO)

    # `ooxml` itself must be imported; `from ooxml import serialize` alone
    # does not bind the package name used here.
    dfile = ooxml.read_from_file(inputFile)

    with open(outputFile, 'w') as htmlFile:
        htmlFile.write(serialize.serialize(dfile.document))
and here's the error:
>>> import library
>>> library.trasnformDocxtoHtml(r'large_file.docx', 'output.html')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "library.py", line 9, in trasnformDocxtoHtml
dfile = ooxml.read_from_file(inputFile)
File "C:\Python27\lib\site-packages\ooxml\__init__.py", line 52, in read_from_file
dfile.parse()
File "C:\Python27\lib\site-packages\ooxml\docxfile.py", line 46, in parse
self._doc = parse_from_file(self)
File "C:\Python27\lib\site-packages\ooxml\parse.py", line 655, in parse_from_file
document = parse_document(doc_content)
File "C:\Python27\lib\site-packages\ooxml\parse.py", line 463, in parse_document
document.elements.append(parse_table(document, elem))
File "C:\Python27\lib\site-packages\ooxml\parse.py", line 436, in parse_table
for p in tc.xpath('./w:p', namespaces=NAMESPACES):
File "src\lxml\etree.pyx", line 1583, in lxml.etree._Element.xpath
MemoryError
no mem for new parser
MemoryError
Could I somehow increase the buffer memory in python? Or fix the function without impacting the html output format?

python docx can not find docx file

I actually have the file in my folder:
my code is :
#-*-coding:utf-8-*-
# Opens the Word document '111.docx' located in the same directory as this
# script.
import re
import time
import datetime
import sys
import os
import csv
import docx
from docx import Document
# NOTE(review): the wildcard import below may rebind names imported above;
# the import order is kept exactly as written.
from docx import *
# Absolute directory of the script being run, derived from sys.argv[0].
CURRENT_DIR = os.path.dirname(os.path.abspath(sys.argv[0]))
# Path of the document, built from the script's location rather than from
# the current working directory.
docxFilePath = os.path.join(CURRENT_DIR,'111.docx')
# Load the document. The traceback in the question shows this call raising
# PackageNotFoundError for this path.
doc=Document(docxFilePath)
When I run it, it returns this error:
Traceback (most recent call last): File
"C:\Users\Windows\Desktop\test\fp\makereport.py", line 20, in <module>
doc=Document(docxFilePath) File "C:\Python27\lib\site-packages\docx\api.py", line 25, in Document
document_part = Package.open(docx).main_document_part File "C:\Python27\lib\site-packages\docx\opc\package.py", line 116, in open
pkg_reader = PackageReader.from_file(pkg_file) File "C:\Python27\lib\site-packages\docx\opc\pkgreader.py", line 32, in
from_file
phys_reader = PhysPkgReader(pkg_file) File "C:\Python27\lib\site-packages\docx\opc\phys_pkg.py", line 31, in
__new__
"Package not found at '%s'" % pkg_file docx.opc.exceptions.PackageNotFoundError: Package not found at
'C:\Users\Windows\Desktop\test\fp\111.docx'
please help
It seems that for file formats such as docx, xlsx, and pdf, the file should be in the current working directory. So you can do:
import os
os.chdir('C://Users/Windows/Desktop/test/fp')
Then see if it works.
I have encountered the same problem, and Scanny has answered it correctly: the file was found but was not a real .docx file.
Don't create the file in some other application and rename it to .docx; create a real .docx file instead.
You can use below to create one using code.
doc = docx.Document()
doc.save("/path/to/file/where/it/needs/to/save/.docx")

'Search for pattern exhausted' happens when processing WARC file in python3

I'm trying to fetch some plain text from a WARC dataset (Yahoo! Webscope L2), and I keep getting ValueError: Search for pattern exhausted when using the load() function in the Python 3 module warcat. I have tried some random WARC example files and everything worked well.
The dataset did ask me to submit a further license agreement (and then a password would be provided, according to the readme file; do WARC files come with passwords?), but for now I'm not equipped to send a fax.
I also checked out the warcat source code, and found that the ValueError is raised when file_obj.read(size) evaluates to False. That makes no sense to me, so I'm asking here...
The code:
>>> import warcat
>>> import warcat.model
>>> warc = warcat.model.WARC()
>>> warc.load('ydata-embedded-metadata-v1_0.warc')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.4/site-packages/warcat/model/warc.py", line 32, in load
self.read_file_object(f)
File "/usr/local/lib/python3.4/site-packages/warcat/model/warc.py", line 39, in read_file_object
record, has_more = self.read_record(file_object)
File "/usr/local/lib/python3.4/site-packages/warcat/model/warc.py", line 75, in read_record
check_block_length=check_block_length)
File "/usr/local/lib/python3.4/site-packages/warcat/model/record.py", line 59, in load
inclusive=True)
File "/usr/local/lib/python3.4/site-packages/warcat/util.py", line 66, in find_file_pattern
raise ValueError('Search for pattern exhausted')
ValueError: Search for pattern exhausted
Thanks in advance.

How to generate PDF file from an HTML file using Reportlab and Pisa in Python?

I have the following code set up to generate a PDF document using Reportlab and Pisa in Python.
import cStringIO

import ho.pisa as pisa


def html_to_pdf(data, filename, open=False):
    """Render the HTML string ``data`` as a PDF written to ``filename``.

    Args:
        data: HTML source to convert, as a string.
        filename: Path of the PDF file to create (opened in binary mode).
        open: Accepted for the caller's convenience but not used by this
            function. It shadows the builtin ``open`` inside the body,
            which is why the output is opened with ``file()`` instead.
    """
    source = cStringIO.StringIO(data)
    # The context manager closes the PDF file even if conversion fails, so
    # the handle is never leaked and buffered output is flushed.
    with file(filename, "wb") as pdf_file:
        pisa.CreatePDF(source, pdf_file)
My HTML file contains standard HTML content.
Its fully qualified path, including the .html extension, is assigned to the output_file variable.
Call it like this:
# Read the whole HTML source first; the file is closed before conversion.
with open(output_file, "r") as my_file:
    contents = my_file.read()
html_to_pdf(contents, dest_pdf_file, open=True)
Get this error:
No handlers could be found for logger "sx.pisa3"
Traceback (most recent call last):
File "/home/devuser/myapp/app.py", line 8, in <module>
from utils.fileutils import FileUtil
File "/home/devuser/myapp/utils/fileutils.py", line 5, in <module>
import ho.pisa as pisa
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/ho/pisa/__init__.py", line 26, in <module>
from sx.pisa3.pisa import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/__init__.py", line 41, in <module>
from pisa import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/pisa.py", line 32, in <module>
from pisa_document import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/pisa_document.py", line 22, in <module>
from pisa_context import pisaContext
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/pisa_context.py", line 21, in <module>
from pisa_util import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/pisa_util.py", line 55, in <module>
raise ImportError("Reportlab Version 2.1+ is needed!")
ImportError: Reportlab Version 2.1+ is needed!
This is a "partial list" of what pip freeze yields.
Pillow==2.3.0
PyPDF2==1.25.1
html5lib==0.999
oneconf==0.3.7.14.04.1
pdfkit==0.5.0
pisa==3.0.33
reportlab==3.0
Seems like a broken installation issue...
Does anyone know how to fix this, or know of any alternative methods (approaches and/or different libraries) for converting HTML files into PDFs?
Got it working... Uninstalled and reinstalled pisa and it worked! :)
sudo easy_install pisa
My code:
import cStringIO

import ho.pisa as pisa


class FileUtil:
    """File-related helper functions."""

    @staticmethod
    def html_to_pdf(html, output_file):
        """Convert the HTML text ``html`` into a PDF saved at ``output_file``.

        Args:
            html: HTML source; it is encoded as ISO-8859-1 before being
                handed to pisa.
            output_file: Path of the PDF file to create (binary mode).
        """
        pdfFile = file(output_file, "wb")
        try:
            pisa.CreatePDF(
                cStringIO.StringIO(html.encode("ISO-8859-1")), pdfFile)
        finally:
            # Close the output even when encoding or conversion raises.
            pdfFile.close()

Categories