Original files are automatically deleted while running the code - python

I have written code in Python to convert DICOM (.dcm) data into a CSV file. However, if I run the code more than once on my database directory, the data gets lost/deleted. I tried searching the Recycle Bin but could not find the deleted data. I am not aware of what went wrong with the data.
Is there anything wrong with my code? Any suggestions are highly appreciated.
here is my code:
import xlsxwriter
import os.path
import sys
import dicom
import xlrd
import csv

root = input("Enter Directory Name: ")
#path = os.path.join(root, "targetdirectory")

i = 1
for path, subdirs, files in os.walk(root):
    for name in files:
        os.rename(os.path.join(path, name), os.path.join(path, 'MR000' + str(i) + '.dcm'))
        i = i + 1

dcm_files = []
for path, dirs, files in os.walk(root):
    for names in files:
        if names.endswith(".dcm"):
            dcm_files.append(os.path.join(path, names))
print(dcm_files)

with open('junk/test_0.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(["Folder Name", "File Name", "PatientName",
                         "PatientID", "PatientBirthDate", "SliceThickness", "Rows"])
    for dcm_file in dcm_files:
        ds = dicom.read_file(dcm_file)
        fileName = dcm_file.split("/")
        spamwriter.writerow([fileName[1], fileName[2],
                             ds.get("PatientName", "None"),
                             ds.get("PatientID", "None"),
                             ds.get("PatientBirthDate", "None"),
                             ds.get("SliceThickness", "None"),
                             ds.get("Rows", "None")])

You have something like the following scenario:
After the 1st iteration, you end up with the files MR0001.dcm, MR0002.dcm, MR0003.dcm, ... In the 2nd iteration, the following renames happen:
os.rename('some_file', 'MR0001.dcm')
os.rename('MR0001.dcm', 'MR0002.dcm')
os.rename('MR0002.dcm', 'MR0003.dcm')
os.rename('MR0003.dcm', 'MR0004.dcm')
...
So at the end only the single file 'MR0004.dcm' is left.
Add the following line just below the rename:
print( os.path.join(path, name), '-->', os.path.join(path,'MR000'+ str(i)+'.dcm'))
Then you will see exactly which files are renamed.
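One way to avoid the collision on later runs is to guard the rename. This is a minimal sketch, assuming every file should be renamed exactly once and that the MR-number pattern below matches your naming scheme:

import os
import re

root = input("Enter Directory Name: ")

i = 1
for path, subdirs, files in os.walk(root):
    for name in files:
        # skip files that were already renamed on a previous run
        if re.match(r'MR\d+\.dcm$', name):
            i = i + 1
            continue
        target = os.path.join(path, 'MR000' + str(i) + '.dcm')
        # never silently overwrite an existing file
        if not os.path.exists(target):
            os.rename(os.path.join(path, name), target)
        i = i + 1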

Related

Convert a PDF files to TXT files

I need a last touch from an expert! I want to convert all PDF files in a directory to TXT files. I wrote code that creates empty TXT files with the same names as the PDF files, and code that converts a single PDF to TXT, but I want to convert all files in the directory. Please see the code below:
PS: I already tried PDFminer, and every other package, and it does not work.
import pandas as pd
import os
import PyPDF2

### Create empty txt files named as the pdf files ###
path = '....\\PDF2Text\\PDF\\'
newpath = '....\\PDF2Text\\Text\\'

files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.pdf' in file:
            files.append(os.path.join(r, file))

for f in files:
    ext = f.replace('.pdf', '.txt')
    extpath = ext.replace(path, newpath)
    ft = open(extpath, "w+")
    ft.close()
    print(extpath)

## Here we convert a single pdf file to a txt file, providing the pdf path and the empty txt path ##
import PyPDF2

def getPDFFileContentToTXT(pdfFile):
    myPDFFile = PyPDF2.PdfFileReader(pdfFile)
    with open('....\\PDF2Text\\Text\\blabla.txt', 'w') as pdf_output:
        for page in range(myPDFFile.getNumPages()):
            data = myPDFFile.getPage(page).extractText()
            pdf_output.write(data)
    with open('.....\\PDF2Text\\Text\\blabla.txt', 'r') as myPDFContent:
        return myPDFContent.read().replace('\n', ' ')

pdfFileContent = getPDFFileContentToTXT('.....\\PDF2Text\\PDF\\blabla.pdf')
import pandas as pd
import os
import PyPDF2

# Create empty txt files named as the pdf files
path = 'C:\\PDF2Text\\PDF\\'
newpath = 'C:\\PDF2Text\\Text\\'

# r=root, d=directories, f=files
files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.pdf' in file:
            files.append(os.path.join(r, file))

for f in files:
    txt = f.replace('.pdf', '.txt')
    txtpath = txt.replace(path, newpath)
    print(f)
    ft = open(txtpath, "w+")
    ft.close()
    print(txtpath)
    Vpath = f.replace('.pdf', '')
    #print(Vpath)
    myPDFFile = PyPDF2.PdfFileReader(f)
    with open(txtpath, 'w') as pdf_output:  # , encoding="utf-8"
        for page in range(myPDFFile.getNumPages()):
            data = myPDFFile.getPage(page).extractText()
            pdf_output.write(data)
    with open(txtpath, 'r') as myPDFContent:
        myPDFContent.read().replace('\n', ' ')
Have you tried Tika? Just do a pip install tika (you also need Java 7+ installed on your system), and maybe this is the piece of code you want:
import os
from tika import parser

def read_pdf(pdf_file):
    text = parser.from_file(pdf_file)['content']
    return text.encode('utf-8')

def pdf_to_txt(folder_with_pdf, dest_folder):
    """
    folder_with_pdf: path to your pdf's
    dest_folder: path where you want .txt files saved
    """
    pdf_files = []
    for root, dirs, files in os.walk(folder_with_pdf):
        for f in files:
            if '.pdf' in f:
                pdf_files.append(os.path.join(root, f))
    #print(pdf_files)
    for file_ in pdf_files:
        text_file = os.path.splitext(os.path.basename(file_))[0] + '.txt'
        with open(os.path.join(dest_folder, text_file), 'wb') as text_f:
            text_f.write(read_pdf(file_))
    return None

pdf_to_txt('./pdf_folder', './txt_folder')  # you should see .txt files being populated in ./txt_folder
Aside: If pdf files in sub-directories of ./pdf_folder happen to have the same name (but different content), you will lose one (or more) of the resulting .txt files.
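One way to sidestep that, as a minimal sketch, is to fold the path relative to the input root into the output name (the underscore separator is an arbitrary choice):

import os

def unique_txt_name(pdf_path, folder_with_pdf):
    # 'a/x.pdf' and 'b/x.pdf' become 'a_x.txt' and 'b_x.txt'
    rel = os.path.relpath(pdf_path, folder_with_pdf)
    return os.path.splitext(rel)[0].replace(os.sep, '_') + '.txt'

You would then use unique_txt_name(file_, folder_with_pdf) in place of the os.path.basename() expression above.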

How to search through both zipped and unzipped folders for a specific line

I'm trying to implement a Python script that takes a folder from the user (which can be zipped or unzipped) and searches through all the files in the folder, printing the lines that my regular expression matches. My code below works for regular unzipped folders, but I can't figure out how to do the same with zipped folders passed to the function. My code is below, thanks in advance!
def myFunction(folder_name):
    path = folder_name
    for (path, subdirs, files) in os.walk(path):
        # Specify here the format of files you hope to search from (ex: ".txt" or ".log")
        files = [f for f in os.listdir(path) if f.endswith('.txt') or f.endswith('.log') or f.endswith('-release') or f.endswith('.out') or f.endswith('messages') or f.endswith('.zip')]
        files.sort()  # file is sorted list
        files = [os.path.join(path, name) for name in files]  # Joins the path and the name, so the files can be opened and scanned by the open() function
        # The following for loop searches all files with the selected format
        for filename in files:
            #print('start parsing... ' + str(datetime.datetime.now()))
            matched_line = []
            try:
                with open(filename, 'r', encoding='utf-8') as f:
                    f = f.readlines()
            except:
                with open(filename, 'r') as f:
                    f = f.readlines()
            #print('Finished parsing... ' + str(datetime.datetime.now()))
            for line in f:
                # strip out \x00 from read content, in case it's encoded differently
                line = line.replace('\x00', '')
                RE2 = r'^Version: \d.+\d.+\d.\w\d.+'
                RE3 = r'^.+version.(\d+.\d+.\d+.\d+)'
                pattern2 = re.compile('(' + RE2 + '|' + RE3 + ')', re.IGNORECASE)
                for match2 in pattern2.finditer(line):
                    matched_line.append(line)
                    print(line)

# Calling the function to use it
myFunction(r"SampleZippedFolder.zip")
The try and except block in my code was my attempt to open the zipped folder and read it. I'm still not very clear on how to open a zipped folder or how it works. Please let me know how I can modify my code to make it work, much appreciated!
One possibility is to first determine what kind of object folder_name is, using zipfile.is_zipfile() and os.path.isdir(), and whichever one succeeds, get the list of files and proceed. Maybe something like this:
import zipfile, os, re

def myFunction(folder_name):
    files = None  # nothing yet
    path = folder_name
    if zipfile.is_zipfile(path):
        print('ZipFile: {}'.format(path))
        f = zipfile.ZipFile(path)
        files = f.namelist()
        # for name in f.namelist():  # debugging
        #     print('file: {}'.format(name))
    elif os.path.isdir(path):
        print('Folder: {}'.format(path))
        files = os.listdir(path)
        # for name in os.listdir(path):  # debugging
        #     print('file: {}'.format(name))
    # should now have a list of files
    # proceed processing the files
    for filename in files:
        ...
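To actually read lines in both cases, the processing loop could branch the same way when opening each entry. A minimal sketch, assuming UTF-8 text (adjust the encoding or error handling to taste):

import zipfile, os

def read_lines(path, filename, zf=None):
    # zf is the zipfile.ZipFile object when the input was a zip, else None
    if zf is not None:
        with zf.open(filename) as entry:
            # zip entries are read as bytes; decode before matching
            return entry.read().decode('utf-8', errors='replace').splitlines()
    with open(os.path.join(path, filename), 'r', encoding='utf-8', errors='replace') as f:
        return f.readlines()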

Copy certain files from one folder to another using python

I am trying to copy only certain files from one folder to another. The filenames are in the attribute table of a shapefile.
I was successful up to writing the filenames into a .csv file and listing the column containing the filenames to be transferred. I am stuck after that on how to read those filenames and copy the corresponding files to another folder. I have read about using shutil.copy/move but am not sure how to apply it. Any help is appreciated. Below is my script:
import arcpy
import csv
import os
import sys
import os.path
import shutil
from collections import defaultdict

fc = 'C:\\work_Data\\Export_Output.shp'
CSVFile = 'C:\\wokk_Data\\Export_Output.csv'
src = 'C:\\UC_Training_Areas'
dst = 'C:\\MOSAIC_Files'

fields = [f.name for f in arcpy.ListFields(fc) if f.type <> 'Geometry']
for i, f in enumerate(fields):
    if f in (['FID', "Area", 'Category', 'SHAPE_Area']):
        fields.remove(f)

with open(CSVFile, 'w') as f:
    f.write(','.join(fields) + '\n')
    with arcpy.da.SearchCursor(fc, fields) as cursor:
        for row in cursor:
            f.write(','.join([str(r) for r in row]) + '\n')
f.close()

columns = defaultdict(list)
with open(CSVFile) as f:
    reader = csv.DictReader(f)
    for row in reader:
        for (k, v) in row.items():
            columns[k].append(v)
print(columns['label'])
Given a file name from columns['label'], you can use the following to copy a file:
srcpath = os.path.join(src, columns['label'])
dstpath = os.path.join(dst, columns['label'])
shutil.copyfile(srcpath, dstpath)
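Since columns['label'] is a list of names rather than a single name, you would loop over it. A minimal sketch, assuming each label is a complete file name inside src:

for label in columns['label']:
    srcpath = os.path.join(src, label)
    dstpath = os.path.join(dst, label)
    if os.path.isfile(srcpath):  # skip labels with no matching file
        shutil.copyfile(srcpath, dstpath)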
Here is the script I used to solve my problem:
import os
import arcpy
import os.path
import shutil

featureclass = "C:\\work_Data\\Export_Output.shp"
src = "C:\\Data\\UC_Training_Areas"
dst = "C:\\Data\\Script"

rows = arcpy.SearchCursor(featureclass)
row = rows.next()
while row:
    print row.Label
    shutil.move(os.path.join(src, str(row.Label)), dst)
    row = rows.next()
Think of it in terms of a source and a destination. Assume you want to copy a file from your Pictures folder to an Images folder somewhere on your machine, where X is your user name and Z is the file name:
import os
import shutil
import glob

source = "C:/Users/X/Pictures/test/Z.jpg"
dest = "C:/Users/Public/Image"

if os.path.exists(dest):
    print("this folder already exists")
else:
    os.mkdir(dest)

for file in glob.iglob(source):
    shutil.copy(file, dest)
print("done")

batch process text to csv using python

I need some help with converting a number of text files to csv files. All my text files are in one folder and I want to convert them to csv files in another folder. The names of the individual files should remain the same. Below is the script I have so far... converting an individual file works fine, but working on all the files within a folder is where I am stuck. Any help will be appreciated.
import csv
import os

directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder")
txt_files = directory
csv_files = output

try:
    for txt_file in txt_files:
        in_txt = csv.reader(open(txt_file, "rb"), delimiter='=')
        for csv_file in csv_files:
            out_csv = csv.writer(open(csv_file, 'wb'))
            out_csv.writerows(in_txt)
except:
    print()
glob.glob() is perfectly suited for this task. Also, use the with context manager when working with files:
import csv
import glob
import os

directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder:")

txt_files = os.path.join(directory, '*.txt')

for txt_file in glob.glob(txt_files):
    with open(txt_file, "rb") as input_file:
        in_txt = csv.reader(input_file, delimiter='=')
        filename = os.path.splitext(os.path.basename(txt_file))[0] + '.csv'
        with open(os.path.join(output, filename), 'wb') as output_file:
            out_csv = csv.writer(output_file)
            out_csv.writerows(in_txt)
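Note this answer targets Python 2 (raw_input and binary-mode file handles for csv). A rough Python 3 equivalent, under the same assumption about the '=' delimiter, would be:

import csv
import glob
import os

directory = input("INPUT Folder:")
output = input("OUTPUT Folder:")

for txt_file in glob.glob(os.path.join(directory, '*.txt')):
    with open(txt_file, newline='') as input_file:
        in_txt = csv.reader(input_file, delimiter='=')
        filename = os.path.splitext(os.path.basename(txt_file))[0] + '.csv'
        with open(os.path.join(output, filename), 'w', newline='') as output_file:
            csv.writer(output_file).writerows(in_txt)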

How to read all text file contents in a folder and copy those contents as rows into one excel file

I have 100 txt files in a folder (named pos). I would like to copy all the file contents and paste them as rows into one Excel file. I found some code on Stack Overflow, but it is not working. Please help me.
import xlwt
import os
import glob

wbk = xlwt.Workbook()
sheet = wbk.add_sheet('data')
path = 'C:\tweet\pos'
row = 0
for files in os.walk(path):
    for file in files:
        if fnmatch(file, '*.txt'):
            L = open(os.path.join(file), "r").read()
            sheet.write(row, 5, L)
            row += 1
wbk.save('read_all_txt_in_folders.xls')
The following program works for me.
Notes:
The '\t' is being interpreted as a tab, not a path delimiter. Try using forward slashes.
You need import fnmatch and fnmatch.fnmatch(file, pattern). glob is not required.
I had to remove the trailing newlines from the strings. In my test case, using L[:-1] was sufficient. You may require a more robust solution.
os.walk() yields tuples of the form (directory, subdirectories, files).
I have left my debugging statements in comments in case they help you.
import xlwt
import os
import fnmatch

wbk = xlwt.Workbook()
sheet = wbk.add_sheet('data')
row = 0
# sheet.write(1, 1, "Hello")
for (dir, dirs, files) in os.walk('.'):
    # print dir
    for file in files:
        # print "  ", file
        if fnmatch.fnmatch(file, '*.txt'):
            L = open(os.path.join(dir, file), "r").read()
            # print "  ", L.__repr__()
            a = sheet.write(row, 5, L[:-1])
            # sheet.write(row, 4, "hello")
            # print "  ", a
            row += 1
wbk.save('read_all_txt_in_folders.xls')
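If L[:-1] turns out to be too blunt (for example, a file with no trailing newline would lose its last character), str.rstrip() is a slightly more robust trim:

a = sheet.write(row, 5, L.rstrip('\r\n'))  # strips any trailing newline characters, and nothing else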
