tokenize error in python - python

I have this sample program from python-docx library to extract text from a docx file.
#!/usr/bin/env python
This file opens a docx (Office 2007) file and dumps the text.
If you need to extract text from documents, use this file as a basis for your
Part of Python's docx module -
See LICENSE for licensing information.
import sys
from docx import opendocx, getdocumenttext
if __name__ == '__main__':
document = opendocx(sys.argv[1])
newfile = open(sys.argv[2], 'w')
"Please supply an input and output file. For example:\n"
" 'My Office 2007 document.docx' 'outp"
# Fetch all the text out of the document we just created
paratextlist = getdocumenttext(document)
# Make explicit unicode version
newparatextlist = []
for paratext in paratextlist:
# Print out text of document with two newlines under each paragraph
It runs fine on its own, but it fails when I put another program called tokenize.py (given just below) in the same directory.
tokenizer ='tokenizers/punkt/english.pickle')
fo = open(sys.argv[1], "r")
data =
print '\n-----\n'.join(tokenizer.tokenize(data))
It gives following error.
Traceback (most recent call last):
File "./", line 14, in <module>
from docx import opendocx, getdocumenttext
File "/usr/local/lib/python2.7/dist-packages/docx-0.2.1-py2.7.egg/", line 12, in <module>
from lxml import etree
File "parsertarget.pxi", line 4, in init lxml.etree (src/lxml/lxml.etree.c:178742)
File "/usr/lib/python2.7/", line 39, in <module>
import tokenize
File "/home/sriram/NLP_TOOLS/EDITING_TOOL/NLP/sriram_work/", line 3, in <module>
File "/usr/local/lib/python2.7/dist-packages/nltk/", line 106, in <module>
from decorators import decorator, memoize
File "/usr/local/lib/python2.7/dist-packages/nltk/", line 176, in <module>
File "/usr/local/lib/python2.7/dist-packages/nltk/", line 154, in decorator
if inspect.isclass(caller):
AttributeError: 'module' object has no attribute 'isclass'
Please tell me how to resolve this. I want to use both the programs in a single shell script.


Error occurring while using win32 in VS Code (in Python)

I am trying to convert text to speech in Python 3.10.2 using the code:
import win32com.client
speaker = win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
But this error keeps occurring:
Traceback (most recent call last): File "d:\Program\Python programing\", line 1, in <module>
import win32com.client File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\",
line 10, in <module>
from . import dynamic File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\",
line 24, in <module>
from . import build File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\",
line 638, in <module>
valid_identifier_chars = string.ascii_letters + string.digits + "_" AttributeError: module 'string' has no attribute 'ascii_letters'
you wrote two lines of code in the same line
Try this
import win32com.client
speaker = win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
or try to put a semicolon after the first line of code
Try this
import win32com.client; speaker =win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
either way should work

Transform docx to html raises python MemoryError

I have a function that converts a docx to html and a large docx file to be converted.
The problem is this function is part of a bigger program and the converted html is parsed afterwards so I cannot afford to use another converter without impacting the rest of the code (which is not wanted). Running on python 2.7.13 installed on 32-bit, but changing to 64-bit is also not desired.
This is the function:
import logging
from ooxml import serialize
def trasnformDocxtoHtml(inputFile, outputFile):
logging.basicConfig(filename='ooxml.log', level=logging.INFO)
dfile = ooxml.read_from_file(inputFile)
with open(outputFile,'w') as htmlFile:
htmlFile.write( serialize.serialize(dfile.document))
and here's the error:
>>> import library
>>> library.trasnformDocxtoHtml(r'large_file.docx', 'output.html')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "", line 9, in trasnformDocxtoHtml
dfile = ooxml.read_from_file(inputFile)
File "C:\Python27\lib\site-packages\ooxml\", line 52, in read_from_file
File "C:\Python27\lib\site-packages\ooxml\", line 46, in parse
self._doc = parse_from_file(self)
File "C:\Python27\lib\site-packages\ooxml\", line 655, in parse_from_file
document = parse_document(doc_content)
File "C:\Python27\lib\site-packages\ooxml\", line 463, in parse_document
document.elements.append(parse_table(document, elem))
File "C:\Python27\lib\site-packages\ooxml\", line 436, in parse_table
for p in tc.xpath('./w:p', namespaces=NAMESPACES):
File "src\lxml\etree.pyx", line 1583, in lxml.etree._Element.xpath
no mem for new parser
Could I somehow increase the buffer memory in python? Or fix the function without impacting the html output format?

python docx can not find docx file

I actually have the file in my folder:
my code is :
import re
import time
import datetime
import sys
import os
import csv
import docx
from docx import Document
from docx import *
CURRENT_DIR = os.path.dirname(os.path.abspath(sys.argv[0]))
docxFilePath = os.path.join(CURRENT_DIR,'111.docx')
When I run it, it returns this error:
Traceback (most recent call last): File
"C:\Users\Windows\Desktop\test\fp\", line 20, in <module>
doc=Document(docxFilePath) File "C:\Python27\lib\site-packages\docx\", line 25, in Document
document_part = File "C:\Python27\lib\site-packages\docx\opc\", line 116, in open
pkg_reader = PackageReader.from_file(pkg_file) File "C:\Python27\lib\site-packages\docx\opc\", line 32, in
phys_reader = PhysPkgReader(pkg_file) File "C:\Python27\lib\site-packages\docx\opc\", line 31, in
"Package not found at '%s'" % pkg_file docx.opc.exceptions.PackageNotFoundError: Package not found at
please help
It seems that for file formats such as docx, xlsx, and pdf, the file should be in the current working directory. So you can do:
import os
Then see if it works.
I encountered the same problem, and Scanny answered it correctly: the file was found, but it was not a real .docx file.
Don't create the file in some other application and then rename it to .docx; create a real .docx file instead.
You can use below to create one using code.
doc = docx.Document()
doc.save("/path/to/file/where/it/needs/to/save/.docx")

'Search for pattern exhausted' happens when processing WARC file in python3

I'm trying to fetch some plain text from a WARC dataset (yahoo!webscope L2), and I keep getting "ValueError: Search for pattern exhausted" when using the load() function of the Python 3 module warcat. I have tried some random WARC example files and everything worked well.
The dataset did ask for a further license to be committed (after which a password would be provided, according to the readme file; do WARC files come with passwords?), but for now I'm not equipped to send a fax.
I also checked out warcat source code, and found that the ValueError would be raised when is False. That makes no sense to me, so I'm asking here...
The code:
>>> import warcat
>>> import warcat.model
>>> warc = warcat.model.WARC()
>>> warc.load('ydata-embedded-metadata-v1_0.warc')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.4/site-packages/warcat/model/", line 32, in load
File "/usr/local/lib/python3.4/site-packages/warcat/model/", line 39, in read_file_object
record, has_more = self.read_record(file_object)
File "/usr/local/lib/python3.4/site-packages/warcat/model/", line 75, in read_record
File "/usr/local/lib/python3.4/site-packages/warcat/model/", line 59, in load
File "/usr/local/lib/python3.4/site-packages/warcat/", line 66, in find_file_pattern
raise ValueError('Search for pattern exhausted')
ValueError: Search for pattern exhausted
Thanks in advance.

How to generate PDF file from an HTML file using Reportlab and Pisa in Python?

I have the following code set up to generate a PDF document using Reportlab and Pisa in Python.
import cStringIO
import ho.pisa as pisa
def html_to_pdf(data, filename, open=False):
pdf = pisa.CreatePDF(
file(filename, "wb"))
My HTML file contains standard HTML content.
Its fully qualified path, along with the .html extension, is assigned to the output_file variable.
Call it like this:
with open(output_file, "r") as my_file:
contents =
html_to_pdf(contents, dest_pdf_file, open=True)
Get this error:
No handlers could be found for logger "sx.pisa3"
Traceback (most recent call last):
File "/home/devuser/myapp/", line 8, in <module>
from utils.fileutils import FileUtil
File "/home/devuser/myapp/utils/", line 5, in <module>
import ho.pisa as pisa
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/ho/pisa/", line 26, in <module>
from sx.pisa3.pisa import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/", line 41, in <module>
from pisa import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/", line 32, in <module>
from pisa_document import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/", line 22, in <module>
from pisa_context import pisaContext
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/", line 21, in <module>
from pisa_util import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/", line 55, in <module>
raise ImportError("Reportlab Version 2.1+ is needed!")
ImportError: Reportlab Version 2.1+ is needed!
This is a "partial list" of what pip freeze yields.
Seems like a broken installation issue...
Does anyone know how to fix this, or know of any alternative methods (approaches and/or different libraries) for converting HTML files into PDFs?
Got it working... Uninstalled and reinstalled pisa and it worked! :)
sudo easy_install pisa
My code:
import cStringIO
import ho.pisa as pisa
class FileUtil:
def html_to_pdf(html, output_file):
pdfFile = file(output_file, "wb")
pdf = pisa.CreatePDF(
cStringIO.StringIO(html.encode("ISO-8859-1")), pdfFile)
