Python replace string with random digits in multiple files - python

This code reads files from a directory and replaces string "400432" with a random 6 digits generated by python, builds a script and executes.
Each file should have a unique random 6 digits with corresponding script. File 1: 123456 and script 123456.sh, file 2: 775463 and script 775463.sh, etc.
The problem is when multiple files are present the 123456 is written into all files and only 123456.sh is created. I am not sure how to loop this and it's been 3 years since I've used python.
Struggling to finish this up. Thanks for any help!
#!/usr/bin/python
import os
import stat
import fileinput
import glob
import sys
import re
import random
import subprocess
import string
##############################################################
# Generates random numbers and replaces in WSP files
##############################################################
def random_digits(y):
return ''.join(random.choice(string.digits) for x in range(y))
rand = str(random_digits(6))
_replace_re = re.compile("400432")
for dirpath, dirnames, filenames in os.walk("/home/mark/WSP_IN/"):
for file in filenames:
file = os.path.join(dirpath, file)
head, tail = os.path.split(file)
tempfile = file + ".temp"
with open(tempfile, "w") as target:
with open(file) as source:
for line in source:
line = _replace_re.sub(rand, line)
target.write(line)
os.rename(tempfile, file)
##############################################################
# Creates script that builds ship to based on random numbers
##############################################################
s = open('/usr/local/bin/wsp_scripts/' + rand + '.sh', 'wb+')
##############################################################
# Define responses as string arguments rather than text
##############################################################
data = '/usr/local/bin/wsp_scripts/'
ext = '.sh'
space = ' ';
script = 'exec ${B}/pro5 -c${CF}'
pathx = ' -m2048 -q ${P} - '
swfile = str(tail+space)
swship = str(rand)
sscript = str(script)
spathx = str(pathx)
shell = str(data+rand+ext)
##############################################################
# Write static text and arguments into file
##############################################################
s.write('#!/bin/bash' + '\n')
s.write('#termsoa=/soatermsystem' + '\n'),
s.write('PATH=$PATH:/usr/basic:/usr/basic/util' + '\n')
s.write('B=/usr/local/basis/pro5' + '\n')
s.write('P=/usr5/prog/utils/SO/EDIshipbuild' + '\n')
s.write('CF=/usr/local/basis/pro5/config.bbx'+ '\n')
s.write('TERMCAP=${B}/termcap' + '\n')
s.write('export PATH TERM TERMCAP' + '\n')
s.write('umask 0' + '\n')
s.write('cd /usr/basic' + '\n')
s.write(sscript+spathx+swfile+rand)
st = os.stat(shell)
os.chmod(shell, st.st_mode | stat.S_IEXEC)
s.close()
##############################################################
# Import script just created and execute
##############################################################
cmd = '/usr/local/bin/wsp_scripts/' + rand + '.sh'
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
out, err = p.communicate()
result = out.split('\n')
for lin in result:
if not lin.startswith('#'):
print(lin)
s.close()

You define rand and create your .sh file outside of the for loop. If you want to redefine rand for each file move its definition into the inner for loop. Same goes for the .sh file creation.
Which I think would look like this.
def random_digits(y):
return ''.join(random.choice(string.digits) for x in range(y))
_replace_re = re.compile("400432")
for dirpath, dirnames, filenames in os.walk("/home/mark/WSP_IN/"):
for file in filenames:
rand = str(random_digits(6))
file = os.path.join(dirpath, file)
head, tail = os.path.split(file)
tempfile = file + ".temp"
with open(tempfile, "w") as target:
with open(file) as source:
for line in source:
line = _replace_re.sub(rand, line)
target.write(line)
os.rename(tempfile, file)
##############################################################
# Creates script that builds ship to based on random numbers
##############################################################
s = open('/usr/local/bin/wsp_scripts/' + rand + '.sh', 'wb+')

I don't know , if using 6 digits is a requirement or not?
import time
def random_digits(y):
return int(time.time())
It will return the 10 digit timestamp and will always be unique.
Let me know, if it helps!

Related

Split log data by MB size in Python

There is a process that generates log data of size more than 10 mb. I have been instructed to split the data into 10mb chunks maximum and write to text files means if the log size is 25 mb then it should be divided into 3 parts - 10, 10, 5mb and written to 3 text files.
Also the second and third text file names should be like "file..._1", "file..._2". To write the _1 and _2, I am using the code - filename="file" + "_" + np.arange(1, 10, 1) + ".txt" but when it is creating a new file with underscore, it is giving UFuncTypeError.
My code is:
def writelog(self, filename, msgstr):
#writing log to .txt file
filename = "log-" + str(date.today()) + ".txt"
current_date_and_time = str(datetime.now())
logfile = open(filename, 'a')
logfile.write(current_date_and_time + msgstr)
logfile.close()
#checking if the text file is more than 10mb, then create a new file
filelocation = "...location.../log-2021-07-20.txt"
filesize = os.stat(filelocation)
sizeoflog = filesize.st_size / (1024 * 1024)
print('Size of log in MB- ' + str(sizeoflog))
if sizeoflog > 10:
filename = "log-" + str(date.today()) + "_" + np.arange(1, 10, 1) + ".txt"
logfile = open(filename, 'a')
logfile.write(current_date_and_time + msgstr)
logfile.close()
return filename
msgstr is a dictionary that I passed in main.py
So, the summary is:
split the data into 10mb chunks each and write to file
first file name will be like log-today's date.txt, second file name will be log-today's date_1.txt and so on.
each file content should start with current_date_and_time and then the msgstr.
How can I address these problems ? I am a beginner in Python..
Here's my approach. I created 2 simple helper functions, one for the filesize (with a try: except block) and another to find the last logfile with a size under 10MB.
Since they don't care about the class itself, you should use the #staticmethod decorator. Note that you need to change the method calls to both getsize() and find_current_log() as I don't know the class name.
from datetime import datetime
import os
class ClassNameGoesHere:
#staticmethod
def getsize(filename):
try:
return os.stat(filename).st_size / 1048576
except FileNotFoundError:
return 0
#staticmethod
def find_current_log(filename):
base_filename = os.path.basename(filename)
if '_' in base_filename:
counter = int(base_filename.split('_')[1].split('.')[0])
else:
counter = 0
while ClassNameGoesHere.getsize(filename) >= 10:
counter += 1
if '_' in base_filename:
base_filename = f"{base_filename.split('_')[0]}_{counter}.txt"
else:
base_filename = f"{base_filename.split('.')[0]}_{counter}.txt"
filename = f'{os.path.dirname(filename)}{os.sep}{base_filename}'
return filename
def writelog(self, filename, msgstr):
filename = ClassNameGoesHere.find_current_log(filename)
with open(filename, 'a') as outfile:
outfile.write(f'{datetime.now()} | {msgstr}\n')
somelogger = ClassNameGoesHere()
somelogger.writelog('path/to/file/log-2021-07-21.txt', 'this is a test messsage')

Python: Check if files exist and copy only the missing files

I'm a begynder in python and trying to make a script that does the following:
Check number of files, if they exist in the destFile
If they all exist, exit the script (don't do anything)
If some files are missing, copy only the missing files from the srcFile to the destFile
The script that I have made is working, but the issue that I would like your help with is to make my script only copies the file/files missing and not as my script is doing now, which copies from file 1 (test1.txt) to the file missing. Example if test4.txt & test5.txt files are missing in destFile, my script will copy from test1.txt to test5.txt, in stead of only copying the two missing files test4.txt & test5.txt.
import os, shutil
from datetime import datetime
count = 0
error = "ERROR! file is missing! (files have been copied)"
sttime = datetime.now().strftime('%d/%m/%Y - %H:%M:%S - ')
os.chdir("C:\log")
log = "log.txt"
srcFile = [r"C:\srcFile\test1.txt",
r"C:\srcFile\test2.txt",
r"C:\srcFile\test3.txt",
r"C:\srcFile\test4.txt",
r"C:\srcFile\test5.txt"]
destFile = [r"C:\destFile\test1.txt",
r"C:\destFile\test2.txt",
r"C:\destFile\test3.txt",
r"C:\destFile\test4.txt",
r"C:\destFile\test5.txt"]
for file in destFile:
if not os.path.exists(file):
for file_sr in srcFile:
if not os.path.exists(file):
shutil.copy(file_sr, 'C:\destFile')
count +=1
with open(log, 'a') as logfile:
logfile.write(sttime + error + " " + str(count) + " => " + file + '\n')
The problem is that you're iterating over all of the source files whenever you detect a missing destination file: for file_sr in srcFile:. Instead, you can copy just the missing file by keeping track of the position (in the array) of the missing destination file:
for position, file in enumerate(destFile):
if not os.path.exists(file):
file_sr = srcFile[position]
if not os.path.exists(file):
shutil.copy(file_sr, 'C:\destFile')
Using your code, you can do:
import os, shutil
from datetime import datetime
count = 0
error = "ERROR! file is missing! (files have been copied)"
sttime = datetime.now().strftime('%d/%m/%Y - %H:%M:%S - ')
os.chdir("C:\log")
log = "log.txt"
srcFile = [r"C:\srcFile\test1.txt",
r"C:\srcFile\test2.txt",
r"C:\srcFile\test3.txt",
r"C:\srcFile\test4.txt",
r"C:\srcFile\test5.txt"]
destFile = [r"C:\destFile\test1.txt",
r"C:\destFile\test2.txt",
r"C:\destFile\test3.txt",
r"C:\destFile\test4.txt",
r"C:\destFile\test5.txt"]
for file in destFile:
if not os.path.exists(file):
src_file = destFile.replace("destFile","srcFile")
shutil.copy(src_file, file)
count +=1
with open(log, 'a') as logfile:
logfile.write(sttime + error + " " + str(count) + " => " + file + '\n')
Thank you for your help guys. Exactly my problem was that I was iterating over all of the source files whenever I detected a missing destination file. The following logic from mackorone is doing what I was looking for.
for position, file in enumerate(destFile):
if not os.path.exists(file):
file_sr = srcFile[position]
shutil.copy(file_sr, 'C:\destFile')
I have updated the script, so now this script compares two folders, source folder and destination folder. If destination folder is missing files from the source folder, it will be copied. The script is working fine.
import os
import shutil
from datetime import datetime
sttime = datetime.now().strftime('%d/%m/%Y - %H:%M:%S - ')
error = "ERROR! file is missing! (files have been copied)"
des_path = 'C:\des_folder'
sr_path = 'C:\sr_folder'
des_folder = os.listdir(des_path)
sr_folder = os.listdir(sr_path)
count = 0
os.chdir("C:\log")
log = "log.txt"
def compare_folder(folder1,folder2):
files_in_sr_folder = set(sr_folder) - set(des_folder)
return files_in_sr_folder
files_missing = compare_folder(sr_folder,des_folder)
if len(files_missing) != 0:
for file in files_missing:
full_path_files = os.path.join(sr_path,file)
shutil.copy(full_path_files,des_path)
count +=1
with open(log, 'a') as logfile:
logfile.write(sttime + error + " " + str(count) + " => " + file + '\n')
else:
exit

How to prevent shutil.move from overwriting a file if it already exists?

I'm using this Python code in Windows:
shutil.move(documents_dir + "\\" + file_name, documents_dir + "\\backup\\"
+ subdir_name + "\\" + file_name)
When this code is called more times, it overwrites the destination file. I would like to move the file
and if the destination already exists, to rename it
e.g. file_name = foo.pdf
and in backup folder will be foo.pdf, foo(1).pdf, foo(2).pdf etc. or similarly e.g. with dashes
foo-1.pdf, foo-2.pdf etc.
You could just check with os.path.exists() as you're going.
import os
import shutil
file_name = 'test.csv'
documents_dir = r'C:\BR\Test'
subdir_name = 'test'
# using os.path.join() makes your code easier to port to another OS
source = os.path.join(documents_dir, file_name)
dest = os.path.join(documents_dir, 'backup', subdir_name, file_name)
num = 0
# loop until we find a file that doesn't exist
while os.path.exists(dest):
num += 1
# use rfind to find your file extension if there is one
period = file_name.rfind('.')
# this ensures that it will work with files without extensions
if period == -1:
period = len(file_name)
# create our new destination
# we could extract the number and increment it
# but this allows us to fill in the gaps if there are any
# it has the added benefit of avoiding errors
# in file names like this "test(sometext).pdf"
new_file = f'{file_name[:period]}({num}){file_name[period:]}'
dest = os.path.join(documents_dir, 'backup', subdir_name, new_file)
shutil.move(source, dest)
Or since this is probably used in a loop you could just drop it into a function.
import os
import shutil
def get_next_file(file_name, dest_dir):
dest = os.path.join(dest_dir, file_name)
num = 0
while os.path.exists(dest):
num += 1
period = file_name.rfind('.')
if period == -1:
period = len(file_name)
new_file = f'{file_name[:period]}({num}){file_name[period:]}'
dest = os.path.join(dest_dir, new_file)
return dest
file_name = 'test.csv'
documents_dir = r'C:\BR\Test'
subdir_name = 'test'
source = os.path.join(documents_dir, file_name)
dest = get_next_file(file_name, os.path.join(documents_dir, 'backup', subdir_name))
shutil.move(source, dest)

Counting reads and bases from a list of fastq files

I trimmed my Illumina short reads, forward and reverse, by using Trimmomatic. The Trimmomatic's outputs were: paired_1 - unpaired_1, and paired_2 - unpaired_2.fastq.gz files. I want to know how big was the impact of trimming by counting the number of reads and bases of each file in my directory. I had made a script to count the number of bases and reads for each file in my directory; however, I have problems in if __name__=='__main__'. When I do the for loop I don't know the order of the files that will be run, how can I make it to call the files by the order I see from the screen? Additionally, I also need help with correcting the script as I don't get any stdout.
Thank you in advance for your help.
#!/usr/bin/env python
from sys import argv
import os
def get_num_bases(file_content):
total = []
for linenumber, line in enumerate(file_content):
mod=linenumber%4
if mod==0:
ID = line.strip()[1:]
#print(ID)
if mod==1:
seq = line.strip()
counting = 0
counting += seq.count("T")+ seq.count("A") + seq.count("C") + seq.count("G")
total.append(counting)
allbases = sum(total)
print("Number of bases are: " , allbases)
def get_num_reads(file_content):
total_1 = []
for line in file_content:
num_reads = 0
num_reads += content.count(line)
total_1.append(num_reads)
print("Number of reads are: ", sum(total_1)/int(4))
if __name__=='__main__':
path = os.getcwd()
dir_files = os.listdir(path)
list_files = []
for file in dir_files:
if file.endswith("fastq.gz"):
if file not in list_files:
file_content = open(file, "r").readlines()
list_files.append(file)
print("This is the filename: ", file, get_num_bases(file_content), get_num_reads(file_content))

Python - How to stop the loop

I have this where it reads a file called source1.html, source2.html, source3.html, but when it cant find the next file (because it doesnt exist) it gives me a error. there can be an x amount of sourceX.html, so i need something to say if the next sourcex.html file can not be found, stop the loop.
Traceback (most recent call last): File "main.py", line 14, in
file = open(filename, "r") IOError: [Errno 2] No such file or
directory: 'source4.html
how can i stop the script looking for the next source file?
from bs4 import BeautifulSoup
import re
import os.path
n = 1
filename = "source" + str(n) + ".html"
savefile = open('OUTPUT.csv', 'w')
while os.path.isfile(filename):
strjpgs = "Extracted Layers: \n \n"
filename = "source" + str(n) + ".html"
n = n + 1
file = open(filename, "r")
soup = BeautifulSoup(file, "html.parser")
thedata = soup.find("div", class_="cplayer")
strdata = str(thedata)
DoRegEx = re.compile('/([^/]+)\.jpg')
jpgs = DoRegEx.findall(strdata)
strjpgs = strjpgs + "\n".join(jpgs) + "\n \n"
savefile.write(filename + '\n')
savefile.write(strjpgs)
print(filename)
print(strjpgs)
savefile.close()
print "done"
use a try / except and break
while os.path.isfile(filename):
try: # try to do this
# <your code>
except FileNotFoundError: # if this error occurs
break # exit the loop
The reason your code doesn't currently work is you're checking the previous file exists in your while loop. Not the next one. Hence you could also do
while True:
strjpgs = "Extracted Layers: \n \n"
filename = "source" + str(n) + ".html"
if not os.path.isfile(filename):
break
# <rest of your code>
you can try opening file, and break out of while loop once you catch an IOError exception.
from bs4 import BeautifulSoup
import re
import os.path
n = 1
filename = "source" + str(n) + ".html"
savefile = open('OUTPUT.csv', 'w')
while os.path.isfile(filename):
try:
strjpgs = "Extracted Layers: \n \n"
filename = "source" + str(n) + ".html"
n = n + 1
file = open(filename, "r")
except IOError:
print("file not found! breaking out of loop.")
break
soup = BeautifulSoup(file, "html.parser")
thedata = soup.find("div", class_="cplayer")
strdata = str(thedata)
DoRegEx = re.compile('/([^/]+)\.jpg')
jpgs = DoRegEx.findall(strdata)
strjpgs = strjpgs + "\n".join(jpgs) + "\n \n"
savefile.write(filename + '\n')
savefile.write(strjpgs)
print(filename)
print(strjpgs)
savefile.close()
print "done"
I'll suggest you to use os.path.exists() (which returns True/False) and os.path.isfile() both.
Use with statement to open file. It is Pythonic way to open files.
with statement is best preferred among the professional coders.
These are the contents of my current working directory.
H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles>dir
Volume in drive H is New Volume
Volume Serial Number is C867-828E
Directory of H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles
11/05/2018 16:12 <DIR> .
11/05/2018 16:12 <DIR> ..
11/05/2018 15:54 106 source1.html
11/05/2018 15:54 106 source2.html
11/05/2018 15:54 106 source3.html
11/05/2018 16:12 0 stopReadingIfNot.md
11/05/2018 16:11 521 stopReadingIfNot.py
5 File(s) 839 bytes
2 Dir(s) 196,260,925,440 bytes free
The below Python code shows how will you read files source1.html, source2.html, source.3.html and stop if there is no more files of the form sourceX.html (where X is 1, 2, 3, 4, ... etc.).
Sample code:
import os
n = 1;
html_file_name = 'source%d.html'
# It is necessary to check if sourceX.html is file or directory.
# If it is directory the check it if it exists or not.
# It it exists then perform operation (read/write etc.) on file.
while os.path.isfile(html_file_name % (n)) and os.path.exists(html_file_name % (n)):
print "Reading ", html_file_name % (n)
# The best way (Pythonic way) to open file
# You don't need to bother about closing the file
# It will be taken care by with statement
with open(html_file_name % (n), "r") as file:
# Make sure it works
print html_file_name % (n), " exists\n";
n += 1;
Output:
H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles>python stopReadingIfNot.py
Reading source1.html
source1.html exists
Reading source2.html
source2.html exists
Reading source3.html
source3.html exists
So based on the above logic. you can modify your code. It will work.
Thanks.
This appears to be a sequence error. Let's look at a small fragment of your code, specifically lines dealing with filename:
filename = "source" + str(n) + ".html"
while os.path.isfile(filename):
filename = "source" + str(n) + ".html"
n = n + 1
file = open(filename, "r")
You're generating the next filename before you open the file (or really, checking the old filename then opening a new one). It's a little hard to see because you're really updating n while filename holds the previous number, but if we look at them in sequence it pops out:
n = 1
filename = "source1.html" # before loop
while os.path.isfile(filename):
filename = "source1.html" # first time inside loop
n = 2
open(filename)
while os.path.isfile(filename): # second time in loop - still source1
filename = "source2.html"
n = 3
open(filename) # We haven't checked if this file exists!
We can fix this a few ways. One is to move the entire updating, n before filename, to the end of the loop. Another is to let the loop mechanism update n, which is a sight easier (the real fix here is that we only use one filename value in each iteration of the loop):
for n in itertools.count(1):
filename = "source{}.html".format(n)
if not os.path.isfile(filename):
break
file = open(filename, "r")
#...
At the risk of looking rather obscure, we can also express the steps functionally (I'm using six here to avoid a difference between Python 2 and 3; Python 2's map wouldn't finish):
from six.moves import map
from itertools import count, takewhile
numbers = count(1)
filenames = map('source{}.html'.format, numbers)
existingfiles = takewhile(os.path.isfile, filenames)
for filename in existingfiles:
file = open(filename, "r")
#...
Other options include iterating over the numbers alone and using break when isfile returns False, or simply catching the exception when open fails (eliminating the need for isfile entirely).

Categories