multiple .doc to .docx file conversion using python

multiple .doc to .docx file conversion using python - python

I want to convert all the .doc files from a particular folder to .docx file.
I tried using the following code,
import subprocess
import os
for filename in os.listdir(os.getcwd()):
if filename.endswith('.doc'):
print filename
subprocess.call(['soffice', '--headless', '--convert-to', 'docx', filename])
But it gives me an error:
OSError: [Errno 2] No such file or directory

Here is a solution that worked for me. The other solutions proposed did not work on my Windows 10 machine using Python 3.
from glob import glob
import re
import os
import win32com.client as win32
from win32com.client import constants
# Create list of paths to .doc files
paths = glob('C:\\path\\to\\doc\\files\\**\\*.doc', recursive=True)
def save_as_docx(path):
# Opening MS Word
word = win32.gencache.EnsureDispatch('Word.Application')
doc = word.Documents.Open(path)
doc.Activate ()
# Rename path with .docx
new_file_abs = os.path.abspath(path)
new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)
# Save and Close
word.ActiveDocument.SaveAs(
new_file_abs, FileFormat=constants.wdFormatXMLDocument
)
doc.Close(False)
for path in paths:
save_as_docx(path)

I prefer to use the glob module for tasks like that. Put this in a file doc2docx.py. To make it executable, set chmod +x. And optionally put that file in your $PATH as well, to make it available "everywhere".
#!/usr/bin/env python
import glob
import subprocess
for doc in glob.iglob("*.doc"):
subprocess.call(['soffice', '--headless', '--convert-to', 'docx', doc])
Though ideally you'd leave the expansion to the shell itself, and call doc2docx.py with the files as arguments, like doc2docx.py *.doc:
#!/usr/bin/env python
import subprocess
import sys
if len(sys.argv) < 2:
sys.stderr.write("SYNOPSIS: %s file1 [file2] ...\n"%sys.argv[0])
for doc in sys.argv[1:]:
subprocess.call(['soffice', '--headless', '--convert-to', 'docx', doc])
As requested by #pyd, to output to a target directory myoutputdir use:
#!/usr/bin/env python
import subprocess
import sys
if len(sys.argv) < 2:
sys.stderr.write("SYNOPSIS: %s file1 [file2] ...\n"%sys.argv[0])
for doc in sys.argv[1:]:
subprocess.call(['soffice', '--headless', '--convert-to', 'docx', '--outdir', 'myoutputdir', doc])

If you don't like to rely on sub-process calls, here is the version with COM client. It is useful if you are targeting windows users without LibreOffice installed.
#!/usr/bin/env python
import glob
import win32com.client
word = win32com.client.Dispatch("Word.Application")
word.visible = 0
for i, doc in enumerate(glob.iglob("*.doc")):
in_file = os.path.abspath(doc)
wb = word.Documents.Open(in_file)
out_file = os.path.abspath("out{}.docx".format(i))
wb.SaveAs2(out_file, FileFormat=16) # file format for docx
wb.Close()
word.Quit()

based on dshefman's code,
import re
import os
import sys
import win32com.client as win32
from win32com.client import constants
# Get path from command line argument
ABS_PATH = sys.argv[1]
def save_as_docx(path):
# Opening MS Word
word = win32.gencache.EnsureDispatch('Word.Application')
doc = word.Documents.Open(path)
doc.Activate ()
# Rename path with .docx
new_file_abs = os.path.abspath(path)
new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)
# Save and Close
word.ActiveDocument.SaveAs(new_file_abs, FileFormat=constants.wdFormatXMLDocument)
doc.Close(False)
def main():
source = ABS_PATH
for root, dirs, filenames in os.walk(source):
for f in filenames:
filename, file_extension = os.path.splitext(f)
if file_extension.lower() == ".doc":
file_conv = os.path.join(root, f)
save_as_docx(file_conv)
print("%s ==> %sx" %(file_conv,f))
if __name__ == "__main__":
main()

Use os.path.join to specify the correct directory.
import os, subprocess
main_dir = os.path.join('/', 'Users', 'username', 'Desktop', 'foldername')
for filename in os.listdir(main_dir):
if filename.endswith('.doc'):
print filename
subprocess.call(['soffice', '--headless', '--convert-to', 'docx', filename])

Related

The os.walk method not work properly after converting to an EXE

I use the os.walk method to get the absolute path of all of the HTML files under the specified path. When I run it as a python script I have no problems, but after converting it to an EXE(Use Pyinstaller), It can't work properly.
Have any matters need attention after converting it to an EXE, or I'm missing something?
import os
import csv
from os import listdir
from os import walk
from os.path import isfile, isdir, join
def get_html_file_path(file_path):
htmlfile = []
for root, dirs, files in walk(file_path):
for f in files:
# print(f)
try:
pass
if f.split('.')[1] == "html":
fullpath = join(root, f)
htmlfile.append(fullpath)
except:
pass
return htmlfile
full_path = os.path.realpath(__file__)
file_path = os.path.dirname(full_path)
aaa = get_html_file_path(file_path)
print(aaa)

You can use sys.executable instead for your purpose of obtaining the base path of the main executable:
import sys
file_path = os.path.dirname(sys.executable)

Prevent zipfile writing directory to archive

I have the below to zip a file. the file zips correctly, but contains the folders within the zip. How can I just zip the file, whilst still being able to show where the file is located?
create_zip_path = "folder1\\folder2\\my_zip.zip"
file_to_add_to_zip = "folder1\\folder2\\my_file.txt"
zip_file(create_zip_path, file_to_add_to_zip)
def zip_file(create_zip_path, file_to_add_to_zip):
import zipfile
try:
import zlib
compression = zipfile.ZIP_DEFLATED
except:
compression = zipfile.ZIP_STORED
modes = { zipfile.ZIP_DEFLATED: 'deflated',
zipfile.ZIP_STORED: 'stored',
}
zf = zipfile.ZipFile(create_zip_path, mode='w')
zf.write(file_to_add_to_zip, compress_type=compression)

You can use the os module to change your working directory. This should work:
import os
print os.getcwd() #Your current working directory
os.chdir(os.getcwd() + '/folder1/folder2/')
print os.getcwd() #Your new wordking dir
create_zip_path = "my_zip.zip"
file_to_add_to_zip = "my_file.txt"
zip_file(create_zip_path, file_to_add_to_zip)

Search files in folder using part of the name and save/copy to different folder using Python

I have 700 files in a single folder. I need to find files that have "h10v03" as part of the name and copy them to a different folder using python.
Heres an example of one of the files: MOD10A1.A2000121.h10v03.005.2007172062725.hdf
I appreciate any help.

Something like this would do the trick.
import os
import shutil
source_dir = "/some/directory/path"
target_dir = "/some/other/directory/path"
part = "h10v03"
files = [file for file in os.listdir(source_dir)
if os.path.isfile(file) and part in file]
for file in files:
shutil.copy2(os.path.join(source_dir, file), target_dir)

Does it need to be python?
A unix shell does that for you quite fine:
cp ./*h10v03* /other/directory/
In python I would suggest you take a look at os.listdir() and shutil.copy()
EDIT:
some untested code:
import os
import shutil
src_dir = "/some/path/"
target_dir = "/some/other/path/"
searchstring = "h10v03"
for f in os.listdir(src_dir):
if searchstring in f and os.path.isfile(os.path.join(src_dir, f)):
shutil.copy2(os.path.join(src_dir, f), target_dir)
print "COPY", f
with the glob module (untested):
import glob
import os
import shutil
for f in glob.glob("/some/path/*2000*h10v03*"):
print f
shutil.copy2(f, os.path.join("/some/target/dir/", os.path.basename(f)))

Firstly, find all the items in that folder with os.listdir. Then you can use the count() method of string to determine if it has your string. Then you can use shutil to copy the file.

Delete files with python through OS shell

Im Tyring to Delete all Files in E:.
with wildcard.
E:\test\*.txt
I would ask rather than test the os.walk.
In windows.

The way you would do this is use the glob module:
import glob
import os
for fl in glob.glob("E:\\test\\*.txt"):
#Do what you want with the file
os.remove(fl)

A slightly verbose writing of another method
import os
dir = "E:\\test"
files = os.listdir(dir)
for file in files:
if file.endswith(".txt"):
os.remove(os.path.join(dir,file))
Or
import os
[os.remove(os.path.join("E:\\test",f)) for f in os.listdir("E:\\test") if f.endswith(".txt")]

You could use popen for this as well if you want to do it in fewer lines
from subprocess import Popen
proc = Popen("del E:\test\*.txt",shell=False)

If you want to delete file with more than one extension then define those extensions in tuple like below
import os
def purge(dir):
files = os.listdir(dir)
ext = ('.txt', '.xml', '.json')
for file in files:
if file.endswith(ext):
print("File -> " + os.path.join(dir,file))
os.remove(os.path.join(dir,file))

Extract files from zip without keeping the structure using python ZipFile?

I try to extract all files from .zip containing subfolders in one folder. I want all the files from subfolders extract in only one folder without keeping the original structure. At the moment, I extract all, move the files to a folder, then remove previous subfolders. The files with same names are overwrited.
Is it possible to do it before writing files?
Here is a structure for example:
my_zip/file1.txt
my_zip/dir1/file2.txt
my_zip/dir1/dir2/file3.txt
my_zip/dir3/file4.txt
At the end I whish this:
my_dir/file1.txt
my_dir/file2.txt
my_dir/file3.txt
my_dir/file4.txt
What can I add to this code ?
import zipfile
my_dir = "D:\\Download\\"
my_zip = "D:\\Download\\my_file.zip"
zip_file = zipfile.ZipFile(my_zip, 'r')
for files in zip_file.namelist():
zip_file.extract(files, my_dir)
zip_file.close()
if I rename files path from zip_file.namelist(), I have this error:
KeyError: "There is no item named 'file2.txt' in the archive"

This opens file handles of members of the zip archive, extracts the filename and copies it to a target file (that's how ZipFile.extract works, without taking care of subdirectories).
import os
import shutil
import zipfile
my_dir = r"D:\Download"
my_zip = r"D:\Download\my_file.zip"
with zipfile.ZipFile(my_zip) as zip_file:
for member in zip_file.namelist():
filename = os.path.basename(member)
# skip directories
if not filename:
continue
# copy file (taken from zipfile's extract)
source = zip_file.open(member)
target = open(os.path.join(my_dir, filename), "wb")
with source, target:
shutil.copyfileobj(source, target)

It is possible to iterate over the ZipFile.infolist(). On the returned ZipInfo objects you can then manipulate the filename to remove the directory part and finally extract it to a specified directory.
import zipfile
import os
my_dir = "D:\\Download\\"
my_zip = "D:\\Download\\my_file.zip"
with zipfile.ZipFile(my_zip) as zip:
for zip_info in zip.infolist():
if zip_info.filename[-1] == '/':
continue
zip_info.filename = os.path.basename(zip_info.filename)
zip.extract(zip_info, my_dir)

Just extract to bytes in memory,compute the filename, and write it there yourself,
instead of letting the library do it - -mostly, just use the "read()" instead of "extract()" method:
Python 3.6+ update(2020) - the same code from the original answer, but using pathlib.Path, which ease file-path manipulation and other operations (like "write_bytes")
from pathlib import Path
import zipfile
import os
my_dir = Path("D:\\Download\\")
my_zip = my_dir / "my_file.zip"
zip_file = zipfile.ZipFile(my_zip, 'r')
for files in zip_file.namelist():
data = zip_file.read(files, my_dir)
myfile_path = my_dir / Path(files.filename).name
myfile_path.write_bytes(data)
zip_file.close()
Original code in answer without pathlib:
import zipfile
import os
my_dir = "D:\\Download\\"
my_zip = "D:\\Download\\my_file.zip"
zip_file = zipfile.ZipFile(my_zip, 'r')
for files in zip_file.namelist():
data = zip_file.read(files, my_dir)
# I am almost shure zip represents directory separator
# char as "/" regardless of OS, but I don't have DOS or Windos here to test it
myfile_path = os.path.join(my_dir, files.split("/")[-1])
myfile = open(myfile_path, "wb")
myfile.write(data)
myfile.close()
zip_file.close()

A similar concept to the solution of Gerhard Götz, but adapted for extracting single files instead of the entire zip:
with ZipFile(zipPath, 'r') as zipObj:
zipInfo = zipObj.getinfo(path_in_zip))
zipInfo.filename = os.path.basename(destination)
zipObj.extract(zipInfo, os.path.dirname(os.path.realpath(destination)))

In case you are getting badZipFile error. you can unzip the archive using 7zip sub process. assuming you have installed the 7zip then use the following code.
import subprocess
my_dir = destFolder #destination folder
my_zip = destFolder + "/" + filename.zip #file you want to extract
ziploc = "C:/Program Files/7-Zip/7z.exe" #location where 7zip is installed
cmd = [ziploc, 'e',my_zip ,'-o'+ my_dir ,'*.txt' ,'-r' ]
#extracting only txt files and from all subdirectories
sp = subprocess.Popen(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

multiple .doc to .docx file conversion using python - python

Related

The os.walk method not work properly after converting to an EXE

Prevent zipfile writing directory to archive

Search files in folder using part of the name and save/copy to different folder using Python

Delete files with python through OS shell

Extract files from zip without keeping the structure using python ZipFile?

Categories

Resources