How to open file, convert string and write to new file - python

I am trying to open a text file, remove certain words that have a ] after them, and then write the new contents to a new file. With the following code, new_content contains what I need, and a new file is created, but it's empty. I cannot figure out why. I've tried indenting differently and passing in an encoding type, with no luck. Any help greatly appreciated.
import glob
import os
import nltk, re, pprint
from nltk import word_tokenize, sent_tokenize
import pandas
import string
import collections
path = "/pathtofiles"
for file in glob.glob(os.path.join(path, '*.txt')):
if file.endswith(".txt"):
f = open(file, 'r')
flines = f.readlines()
for line in flines:
content = line.split()
for word in content:
if word.endswith(']'):
content.remove(word)
new_content = ' '.join(content)
f2 = open((file.rsplit( ".", 1 )[ 0 ] ) + "_preprocessed.txt", "w")
f2.write(new_content)
f.close

This should work #firefly. Happy to answer questions if you have them.
import glob
import os
path = "/pathtofiles"
for file in glob.glob(os.path.join(path, '*.txt')):
if file.endswith(".txt"):
with open(file, 'r') as f:
flines = f.readlines()
new_content = []
for line in flines:
content = line.split()
new_content_line = []
for word in content:
if not word.endswith(']'):
new_content_line.append(word)
new_content.append(' '.join(new_content_line))
f2 = open((file.rsplit( ".", 1 )[ 0 ] ) + "_preprocessed.txt", "w")
f2.write('\n'.join(new_content))
f.close
f2.close

Related

Regex search in python for angular project

Searching through one file:
import re
textfile = open("dummyData", 'r')
matches = []
reg = re.compile("\w+(?=\s?\|\s?maximum)")
for line in textfile:
matches += reg.findall(line)
textfile.close()
print(matches)
Questions:
How can I search through whole angular project and have all matches in one array?
Thank you in advance!
Use glob:
import re, glob
matches = []
reg = re.compile(r"\w+(?=\s?\|\s?maximum)")
files = [file for file in glob.glob("*dummyData*")]
for file_name in files:
with open(file_name, 'r') as f:
for line in f:
matches += reg.findall(line)
print(matches)

Defining a list within the code versus reading it from a file

I am trying to count the number of specific words in a given report. Does anyone know why defining a list within the code makes the second part of the following code run faster than reading the list from a file? Is there a solution? The list contains the same words is a lot longer than two words in the following example.
# Example code: Within code list
import csv
import glob
import re
import time
TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']
create = {'agile', 'skills'}
def main():
f_out = open(OUTPUT_FILE, 'w')
wr = csv.writer(f_out, lineterminator='\n')
wr.writerow(OUTPUT_FIELDS)
file_list = glob.glob(TARGET_FILES)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
wr.writerow(output_data)
def get_data(doc):
_odata = [0] * 2
tokens = re.findall('\w(?:[-\w]*\w)?', doc)
for token in tokens:
if token in create:
_odata[1] += 1
return _odata
Here is the other way:
# Example code: Reading list from a file
import csv
import glob
import re
import time
TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()
def main():
f_out = open(OUTPUT_FILE, 'w')
wr = csv.writer(f_out, lineterminator='\n')
wr.writerow(OUTPUT_FIELDS)
file_list = glob.glob(TARGET_FILES)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
wr.writerow(output_data)
def get_data(doc):
_odata = [0] * 2
tokens = re.findall('\w(?:[-\w]*\w)?', doc)
for token in tokens:
if token in create:
_odata[1] += 1
return _odata
As pointed out by Mark in the comments, the first code snippet uses a set of strings, while the second code snippet loads a file into a list of strings.
Why sets are faster than lists in this use case, is well explained in this Stack Overflow answer. Parsing the output of open to a set can indeed solve your problem.
So replace:
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()
With:
create = set(open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines())

How to get number only values from a specific row from different text file

I am trying to get only numbers from a particular row from 10 different text files. As an output, I want those numbers appended as a list. I'm a new learner. I would appreciate your help.
tried this one but not working
import os
import sys,re
line_number=69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'
for filename in os.listdir(path):
with open(os.path.join(path, filename), 'r') as f:
#print (filename)
file = open(filename)
all_lines_variable = file.readlines()
sys.stdout = open("output", "a") #print output file
print(filename, all_lines_variable[line_number])
sys.stdout.close()
You can try this script, it will extract from all files line number 69 and then appends it to output.txt file:
import os
import re
line_number=69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'
with open('output.txt', 'w') as f_out:
for file in os.listdir(path):
with open(os.path.join(path, file), 'r') as f_in:
lines = f_in.readlines()
print(' '.join(re.findall(r'\d+', lines[line_number])), file=f_out)

How to rename file names using text file in python

I need to rename a bunch of files in a folder with new name reference from a text file. Can you please give an example for this.
My New Names In a Text file:
1BA
1BB
1BC
1BD
1BE
1BF
1C0
1C1
1C2
1C3
Like this.
Updated Code:
import csv
import os
with open('names.txt') as f2:
filedata = f2.read().split(",")
os.rename(filedata[0].strip(), filedata[1].strip())
f2.close()
f2 = open ('Lines.txt','w')
f2.write(filedata)
f2.close()
What about using a CSV (comma separated) file for input in the format oldPath, newPath and do the following:
import csv
import os
with open('names.csv') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
oldPath = row[0]
newPath = row[1]
os.rename(oldPath, newPath)
Alternatively, if you want to move the file to another directory/filesystem you can have a look at shutil.move
# Create old.txt and rename.txt
# Do not include file path inside the txt files
# For each line, write a filename include the extension
from pathlib import Path
import os
import sys
print("enter path of folder")
path = input()
oldList = []
with open(path + "\\old.txt", "r", encoding="utf8", errors='ignore') as readtxt:
for f in readtxt:
fname = f.rstrip()
oldList.append(fname)
# i is index of oldList
i = 0
newList = []
with open(path + "\\rename.txt", "r", encoding="utf8", errors='ignore') as readtxt:
for f in readtxt:
newName = f.rstrip()
os.rename(path + "\\" + oldList[i], path + "\\" + newName)
i += 1

Python: Issue when trying to read and write multiple files

This script reads and writes all the individual html files in a directory. The script reiterates, highlight and write the output.The issue is, after highlighting the last instance of the search item, the script removes all the remaining contents after the last search instance in the output of each file. Any help here is appreciated.
import os
import sys
import re
source = raw_input("Enter the source files path:")
listfiles = os.listdir(source)
for f in listfiles:
filepath = os.path.join(source+'\\'+f)
infile = open(filepath, 'r+')
source_content = infile.read()
color = ('red')
regex = re.compile(r"(\b in \b)|(\b be \b)|(\b by \b)|(\b user \b)|(\bmay\b)|(\bmight\b)|(\bwill\b)|(\b's\b)|(\bdon't\b)|(\bdoesn't\b)|(\bwon't\b)|(\bsupport\b)|(\bcan't\b)|(\bkill\b)|(\betc\b)|(\b NA \b)|(\bfollow\b)|(\bhang\b)|(\bbelow\b)", re.I)
i = 0; output = ""
for m in regex.finditer(source_content):
output += "".join([source_content[i:m.start()],
"<strong><span style='color:%s'>" % color[0:],
source_content[m.start():m.end()],
"</span></strong>"])
i = m.end()
outfile = open(filepath, 'w')
outfile.seek(0, 2)
outfile.write(output)
print "\nProcess Completed!\n"
infile.close()
outfile.close()
raw_input()
After your for loop is over, you need to include whatever is left after the last match:
...
i = m.end()
output += source_content[i:]) # Here's the end of your file
outfile = open(filepath, 'w')
...

Categories