I have this sample program from python-docx library example-extracttext.py to extract text from a docx file.
#!/usr/bin/env python
"""
This file opens a docx (Office 2007) file and dumps the text.
If you need to extract text from documents, use this file as a basis for your
work.
Part of Python's docx module - http://github.com/mikemaccana/python-docx
See LICENSE for licensing information.
"""
import sys
from docx import opendocx, getdocumenttext
if __name__ == '__main__':
    # Two command-line arguments are required: the .docx file to read and
    # the text file to write.
    try:
        document = opendocx(sys.argv[1])
        newfile = open(sys.argv[2], 'w')
    except (IndexError, IOError, OSError):
        # IndexError: an argument is missing.  IOError/OSError: one of the
        # paths could not be opened.  Any other exception (including
        # Ctrl-C) propagates with its own traceback instead of being hidden
        # behind the usage text.
        print(
            "Please supply an input and output file. For example:\n"
            " example-extracttext.py 'My Office 2007 document.docx' 'outp"
            "utfile.txt'"
        )
        # Non-zero status so a calling shell script can detect the failure.
        sys.exit(1)

    # Close the output file even if extraction or writing fails.
    with newfile:
        # Fetch all the text out of the document we just opened
        paratextlist = getdocumenttext(document)
        # Make explicit UTF-8 encoded version of every paragraph.
        # NOTE(review): this targets Python 2; under Python 3 encode()
        # yields bytes, which cannot be joined with a str separator.
        newparatextlist = [
            paratext.encode("utf-8") for paratext in paratextlist
        ]
        # Write out text of document with two newlines under each paragraph
        newfile.write('\n\n'.join(newparatextlist))
It runs fine on its own, but it fails when I put another program called tokenize.py (given just below) in the same directory.
# NOTE: do not save this script under the name tokenize.py.  That name
# shadows the standard-library module `tokenize`, which the stdlib
# `inspect` module imports, so any program started from the same directory
# picks up this file instead.
import sys

import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Read the whole input file given as the first command-line argument and
# close it as soon as its text is in memory.
with open(sys.argv[1], "r") as fo:
    data = fo.read()

# Print the tokenizer's output pieces separated by a dashed line.  The
# single parenthesised argument keeps this valid on Python 2 and Python 3.
print('\n-----\n'.join(tokenizer.tokenize(data)))
It gives the following error.
Traceback (most recent call last):
File "./example-extracttext.py", line 14, in <module>
from docx import opendocx, getdocumenttext
File "/usr/local/lib/python2.7/dist-packages/docx-0.2.1-py2.7.egg/docx.py", line 12, in <module>
from lxml import etree
File "parsertarget.pxi", line 4, in init lxml.etree (src/lxml/lxml.etree.c:178742)
File "/usr/lib/python2.7/inspect.py", line 39, in <module>
import tokenize
File "/home/sriram/NLP_TOOLS/EDITING_TOOL/NLP/sriram_work/tokenize.py", line 3, in <module>
import nltk.data
File "/usr/local/lib/python2.7/dist-packages/nltk/__init__.py", line 106, in <module>
from decorators import decorator, memoize
File "/usr/local/lib/python2.7/dist-packages/nltk/decorators.py", line 176, in <module>
#decorator
File "/usr/local/lib/python2.7/dist-packages/nltk/decorators.py", line 154, in decorator
if inspect.isclass(caller):
AttributeError: 'module' object has no attribute 'isclass'
Please tell me how to resolve this. I want to use both the programs in a single shell script.
Related
I am trying to convert text to speech in Python 3.10.2 using the code:
import win32com.client
# Ask win32com for the object registered under the ProgID "SAPI.SpVoice"
# (presumably the Windows speech API voice -- confirm), then call its
# Speak method with the text to read aloud.
speaker = win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
But the following error keeps occurring:
Traceback (most recent call last): File "d:\Program\Python programing\tempCodeRunnerFile.py", line 1, in <module>
import win32com.client File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\__init__.py",
line 10, in <module>
from . import dynamic File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\dynamic.py",
line 24, in <module>
from . import build File "C:\Users\ASUS\AppData\Roaming\Python\Python310\site-packages\win32com\client\build.py",
line 638, in <module>
valid_identifier_chars = string.ascii_letters + string.digits + "_" AttributeError: module 'string' has no attribute 'ascii_letters'
You wrote two lines of code on the same line.
Try this
# One statement per line: import, create the voice object, speak.
import win32com.client
speaker = win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
or try to put a semicolon after the first line of code
Try this
# The semicolon separates two statements on one physical line; this runs
# the same import and assignment as the two-line form.
import win32com.client; speaker =win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello, it works!")
either way should work
I have a function that converts a docx to html and a large docx file to be converted.
The problem is this function is part of a bigger program and the converted html is parsed afterwards so I cannot afford to use another converter without impacting the rest of the code (which is not wanted). Running on python 2.7.13 installed on 32-bit, but changing to 64-bit is also not desired.
This is the function:
import logging

# The function body refers to the package itself (ooxml.read_from_file), so
# the package name must be bound in addition to the serialize submodule.
import ooxml
from ooxml import serialize


def trasnformDocxtoHtml(inputFile, outputFile):
    """Convert the docx file at *inputFile* to HTML written to *outputFile*.

    The document is loaded with ``ooxml.read_from_file`` and its
    ``document`` attribute is passed to ``serialize.serialize``; the result
    is written to *outputFile*, which is created or overwritten.

    Logging is configured to write to ``ooxml.log`` at INFO level.

    Args:
        inputFile: Path of the .docx file to read.
        outputFile: Path of the HTML file to write.

    Raises:
        Whatever ``ooxml.read_from_file`` or ``serialize.serialize`` raise;
        a ``MemoryError`` from the parser has been observed for a very
        large input document.
    """
    logging.basicConfig(filename='ooxml.log', level=logging.INFO)
    dfile = ooxml.read_from_file(inputFile)
    with open(outputFile, 'w') as htmlFile:
        htmlFile.write(serialize.serialize(dfile.document))
and here's the error:
>>> import library
>>> library.trasnformDocxtoHtml(r'large_file.docx', 'output.html')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "library.py", line 9, in trasnformDocxtoHtml
dfile = ooxml.read_from_file(inputFile)
File "C:\Python27\lib\site-packages\ooxml\__init__.py", line 52, in read_from_file
dfile.parse()
File "C:\Python27\lib\site-packages\ooxml\docxfile.py", line 46, in parse
self._doc = parse_from_file(self)
File "C:\Python27\lib\site-packages\ooxml\parse.py", line 655, in parse_from_file
document = parse_document(doc_content)
File "C:\Python27\lib\site-packages\ooxml\parse.py", line 463, in parse_document
document.elements.append(parse_table(document, elem))
File "C:\Python27\lib\site-packages\ooxml\parse.py", line 436, in parse_table
for p in tc.xpath('./w:p', namespaces=NAMESPACES):
File "src\lxml\etree.pyx", line 1583, in lxml.etree._Element.xpath
MemoryError
no mem for new parser
MemoryError
Could I somehow increase the buffer memory in python? Or fix the function without impacting the html output format?
I actually have the file in my folder:
my code is :
#-*-coding:utf-8-*-
import re
import time
import datetime
import sys
import os
import csv
import docx
from docx import Document
from docx import *
# Directory of the script being run, derived from sys.argv[0] and made
# absolute so it does not depend on the current working directory.
CURRENT_DIR = os.path.dirname(os.path.abspath(sys.argv[0]))
# The document is expected to sit next to the script.
docxFilePath = os.path.join(CURRENT_DIR,'111.docx')
# Open the document; the traceback that follows shows this call raising
# PackageNotFoundError for this path.
doc=Document(docxFilePath)
When I run it, it gives me this error:
Traceback (most recent call last): File
"C:\Users\Windows\Desktop\test\fp\makereport.py", line 20, in <module>
doc=Document(docxFilePath) File "C:\Python27\lib\site-packages\docx\api.py", line 25, in Document
document_part = Package.open(docx).main_document_part File "C:\Python27\lib\site-packages\docx\opc\package.py", line 116, in open
pkg_reader = PackageReader.from_file(pkg_file) File "C:\Python27\lib\site-packages\docx\opc\pkgreader.py", line 32, in
from_file
phys_reader = PhysPkgReader(pkg_file) File "C:\Python27\lib\site-packages\docx\opc\phys_pkg.py", line 31, in
__new__
"Package not found at '%s'" % pkg_file docx.opc.exceptions.PackageNotFoundError: Package not found at
'C:\Users\Windows\Desktop\test\fp\111.docx'
please help
It seems that for other file formats such as docx, xlsx, and PDF, the file should be in the current working directory. So you can do:
import os
# Make the folder that holds the document the current working directory.
os.chdir('C://Users/Windows/Desktop/test/fp')
Then see if it works.
I encountered the same problem, and Scanny answered it correctly: the file was found, but it was not a real .docx file.
Don't create the file in another application and rename it to .docx; create a real .docx file instead.
You can use below to create one using code.
# Create a document object with docx.Document() and save it to the given
# path (replace the placeholder path with the real destination).
doc = docx.Document()
doc.save("/path/to/file/where/it/needs/to/save/.docx")
I'm trying to fetch some plain text from a WARC dataset (yahoo!webscope L2), and keep meeting ValueError: Search for pattern exhausted when using load() function in python3 module warcat. Have tried some random WARC example files and everything worked well.
The dataset did ask for a further license agreement to be submitted (after which a password would be provided, according to the readme file; do WARC files come with passwords?), but for now I'm not equipped to send a fax.
I also checked the warcat source code and found that the ValueError is raised when file_obj.read(size) evaluates to False. That makes no sense to me, so I'm asking here...
The code:
>>> import warcat
>>> import warcat.model
>>> warc = warcat.model.WARC()
>>> warc.load('ydata-embedded-metadata-v1_0.warc')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.4/site-packages/warcat/model/warc.py", line 32, in load
self.read_file_object(f)
File "/usr/local/lib/python3.4/site-packages/warcat/model/warc.py", line 39, in read_file_object
record, has_more = self.read_record(file_object)
File "/usr/local/lib/python3.4/site-packages/warcat/model/warc.py", line 75, in read_record
check_block_length=check_block_length)
File "/usr/local/lib/python3.4/site-packages/warcat/model/record.py", line 59, in load
inclusive=True)
File "/usr/local/lib/python3.4/site-packages/warcat/util.py", line 66, in find_file_pattern
raise ValueError('Search for pattern exhausted')
ValueError: Search for pattern exhausted
Thanks in advance.
Have the following code setup as follows to generate a PDF document using Reportlab and Pisa in Python.
import cStringIO
import ho.pisa as pisa
def html_to_pdf(data, filename, open=False):
    """Render the HTML string *data* into a PDF file at *filename*.

    Args:
        data: HTML source; it is wrapped in a ``cStringIO`` buffer and
            handed to ``pisa.CreatePDF``.
        filename: Path of the PDF to create, opened in binary write mode.
        open: Not used by this function; kept so existing callers that
            pass it keep working.  Inside the body it shadows the builtin
            ``open``, which is why the file is opened with ``file(...)``.
    """
    # The context manager flushes and closes the output handle even if
    # pisa raises part-way through; the original never closed it.
    with file(filename, "wb") as pdf_file:
        pisa.CreatePDF(cStringIO.StringIO(data), pdf_file)
My HTML file contains standard HTML content.
Its fully qualified path, along with the .html extension, is assigned to the output_file variable.
Call it like this:
# Read the HTML file in full first; the handle is closed before the
# conversion starts.
with open(output_file, "r") as html_source:
    contents = html_source.read()

# Convert the text that was read into the destination PDF.
html_to_pdf(contents, dest_pdf_file, open=True)
Get this error:
No handlers could be found for logger "sx.pisa3"
Traceback (most recent call last):
File "/home/devuser/myapp/app.py", line 8, in <module>
from utils.fileutils import FileUtil
File "/home/devuser/myapp/utils/fileutils.py", line 5, in <module>
import ho.pisa as pisa
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/ho/pisa/__init__.py", line 26, in <module>
from sx.pisa3.pisa import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/__init__.py", line 41, in <module>
from pisa import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/pisa.py", line 32, in <module>
from pisa_document import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/pisa_document.py", line 22, in <module>
from pisa_context import pisaContext
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/pisa_context.py", line 21, in <module>
from pisa_util import *
File "/usr/local/lib/python2.7/dist-packages/pisa-3.0.33-py2.7.egg/sx/pisa3/pisa_util.py", line 55, in <module>
raise ImportError("Reportlab Version 2.1+ is needed!")
ImportError: Reportlab Version 2.1+ is needed!
This is a "partial list" of what pip freeze yields.
Pillow==2.3.0
PyPDF2==1.25.1
html5lib==0.999
oneconf==0.3.7.14.04.1
pdfkit==0.5.0
pisa==3.0.33
reportlab==3.0
Seems like a broken installation issue...
Does anyone know how to fix this or any alternative methods (approaches and / or different libraries) used to generate HTML files into PDFs?
Got it working... Uninstalled and reinstalled pisa and it worked! :)
sudo easy_install pisa
My code:
import cStringIO
import ho.pisa as pisa
class FileUtil:
    """Namespace for file-conversion helpers."""

    @staticmethod
    def html_to_pdf(html, output_file):
        """Write *html* as a PDF to *output_file* using pisa.

        Args:
            html: HTML source text.  It is encoded as ISO-8859-1 before
                being passed to pisa, so a character outside that charset
                raises ``UnicodeEncodeError``.
            output_file: Path of the PDF to create, opened in binary
                write mode.
        """
        # The decorator above is required: the function takes no ``self``,
        # so without it the method cannot be called on the class.
        # The context manager closes the PDF handle even if pisa raises.
        with file(output_file, "wb") as pdfFile:
            pisa.CreatePDF(
                cStringIO.StringIO(html.encode("ISO-8859-1")), pdfFile)