Related
I have a text file (heavily modified for this example) which has some data that I want to extract and do some calculations with it. However the text file is extremely messy, so I'm trying to clean it up and write it out to new files first.
Here is the .txt file I'm working with: http://textuploader.com/5elql
I am trying to extract the data which is under the titles (called “Important title”). The only possible way to do that is to first locate a string which always occurs in the file, and its called “DATASET” because all the mess above and below the important data will cover an arbitrary number of lines, difficult to remove manually. Once that’s done I want to store the data in separate files so that it is easier to analyse like this:
http://textuploader.com/5elqw
The file names will be concatenated with the title + the date.
Here is what I have tried so far
with open("example.txt") as file:
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:]
if line.startswith("DATE:"):
fileDate = line[:]
print(fileTitle+fileDate)
OUTPUT
IMPORTANT TITLE 1
DATE: 12/30/2015
IMPORTANT TITLE 2
DATE: 01/03/2016
So it appears my loop manages to locate the lines where the titles inside the file are and print them out. But this is where I run out of steam. I have no idea on how to extract the data under those titles from there onwards. I have tried using file.readlines() but it outputs all the mess that is in between Important Title 1 and Important Title 2.
Any advice on how I can read all the data under the titles and output them into separate files? Thanks for your time.
You could use regex.
import re
pattern = r"(\s+X\s+Y\s*)|(\s*\d+\s+\d+\s*)"
prog = re.compile(pattern)
with open("example.txt") as file:
cur_filename = ''
content = ""
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:]
elif line.startswith("DATE:"):
fileDate = line[6:]
cur_filename = (fileTitle.strip() + fileDate.strip()).replace('/', '-')
print(cur_filename)
content_title = fileTitle + line
elif prog.match(line):
content += line
elif cur_filename and content:
with open(cur_filename, 'w') as fp:
fp.write(content_title)
fp.write(content)
cur_filename = ''
content = ''
I don't know exactly how you want to store your data but assuming you want a dictionary you could use regex to check if the incoming line matched the pattern, then because fileTitle isn't global you could use that as the key and add the values. I also added rstrip('\r\n') to remove the newline characters after fileTitle.
import re
#if you don't want to store the X and Y, just use re.compile('\d\s+\d+')
p = re.compile('(\d\s+\d+)|(X\s+Y)')
data={}
with open("input.txt") as file:
for line in file:
if line.startswith('DATASET:'):
fileTitle = line[9:].rstrip('\r\n')
if line.startswith("DATE:"):
fileDate = line[:]
print(fileTitle+fileDate)
if p.match(line):
if fileTitle not in data:
data[fileTitle]=[]
line=line.rstrip('\r\n')
data[fileTitle].append(line.split('\t'))
if len(data[fileTitle][len(data[fileTitle])-1]) == 3:
data[fileTitle][len(data[fileTitle])-1].pop()
print data
Yet another regex solution:
sep = '*************************\n'
pattern = r'DATASET[^%]*'
good_stuff = re.compile(pattern)
pattern = r'^DATASET: (.*?)$'
title = re.compile(pattern, flags = re.MULTILINE)
pattern = r'^DATE: (.*?)$'
date = re.compile(pattern, flags = re.MULTILINE)
with open(r'foo.txt') as f:
data = f.read()
for match in good_stuff.finditer(data):
data = match.group()
important_title = title.search(data).group(1)
important_date = date.search(data).group(1)
important_date = important_date.replace(r'/', '-')
fname = important_title + important_date + '.txt'
print(sep, fname)
print(data)
##with open(fname, 'w') as f:
## f.write(data)
I have a function that checks if the text is in file.txt or not.
The function works like this: If the text is contained in the file, the file is closed. If the text is not contained in the file, it is added.
But it doesn't work.
import urllib2, re
from bs4 import BeautifulSoup as BS
def SaveToFile(fileToSave, textToSave):
datafile = file(fileToSave)
for line in datafile:
if textToSave in line:
datafile.close()
else:
datafile.write(textToSave + '\n')
datafile.close()
urls = ['url1', 'url2'] # i dont want to public the links.
patGetTitle = re.compile(r'<title>(.*)</title>')
for url in urls:
u = urllib2.urlopen(url)
webpage = u.read()
title = re.findall(patGetTitle, webpage)
SaveToFile('articles.txt', title)
# so here. If the title of the website is already in articles.txt
# the function should close the file.
# But if the title is not found in articles.txt the function should add it.
You can change the SaveToFile function like this
Your title is a list and not a string so you should call it like this SaveToFile('articles.txt', title[0]) to get the first element of the list
def SaveToFile(fileToSave, textToSave):
with open(fileToSave, "r+") as datafile:
for line in datafile:
if textToSave in line:
break
else:
datafile.write(textToSave + '\n')
Notes:
Since you very looping over an empty file the loop did not even run once.
i.e.)
for i in []:
print i # This will print nothing since it is iterating over empty list same as yours
You have passed a list and not a string since re.findall returns a list object you have to pass the first element of the list to the function.
I have used for..else here if the loop is not terminated properly the else case will work.
i.e.)
for i in []:
print i
else:
print "Nooooo"
Output:
Nooooo
Just use r+ mode like this:
def SaveToFile(fileToSave, textToSave):
with open(fileToSave, 'r+') as datafile:
if textToSave not in datafile.read():
datafile.write(textToSave + '\n')
About that file mode, from this answer:
``r+'' Open for reading and writing. The stream is positioned at the
beginning of the file.
And re.find_all() always return a list, so if you're trying to write a list instead of string you'll get an error.
So you could use:
def SaveToFile(fileToSave, textToSave):
if len(textToSave) => 1:
textToSave = textToSave[0]
else:
return
with open(fileToSave, 'r+') as datafile:
if textToSave not in datafile.read():
datafile.write(textToSave + '\n')
You should refactor your SaveToFile function to like this.
def SaveToFile(fileToSave, titleList):
with open(fileToSave, 'a+') as f:
data = f.read()
for titleText in titleList:
if titleText not in data:
f.write(titleText + '\n')
f.close()
This function read a content of file (if exist or created if not) and checks whether textToSave is in the file contents. If it found textToSave then, close file otherwise write content to file.
This seems closer to your problem.
This checks if the text in the file:
def is_text_in_file(file_name, text):
with open(file_name) as fobj:
for line in fobj:
if text in line:
return True
return False
This use the function above to check and writes the text to end of the file if it is not in file yet.
def save_to_file(file_name, text):
if not is_text_in_file in (file_name, text):
with open(file_name, 'a') as fobj:
fobj.write(text + '\n')
What I am doing is, removing all parts of speech except nouns from a text.
I have written a function for that. It may not be the best or optimized code to do that because I have just started coding in python. I am sure the bug must be very basic but I am just not able to figure it out.
In my function two inputs go as parameters. One is the location of text on hard drive and other is the location of file where we want the output.
Following is the code.
def extract_nouns(i_location, o_location):
import nltk
with open(i_location, "r") as myfile:
data = myfile.read().replace('\n', '')
tokens = nltk.word_tokenize(data)
tagged = nltk.pos_tag(tokens)
length = len(tagged)
a = list()
for i in range(0,length):
print(i)
log = (tagged[i][1][0] == 'N')
if log == False:
a.append(tagged[i][0])
fin = open(i_location, 'r')
fout = open(o_location, "w+")
for line in fin:
for word in a:
line = line.replace(word, "")
fout.write(line)
with open(o_location, "r") as myfile_new:
data_out = myfile_new.read().replace('\n', '')
return data_out
When I call this function it is working just fine. I am getting the output on hard disk as I had intended but it does not return the output on the interface or should I say, it is returning a blank string instead of the actual output string.
This is how I am calling it.
t = extract_nouns("input.txt","output.txt")
If you want to try it, take following as the content of input file
"At eight o'clock on
Thursday film morning word line test
best beautiful Ram Aaron design"
This is the output I am getting in the output file (output.txt) when I call the function but the function returns blank string on the interface instead. It does not even print the output.
"
Thursday film morning word line test
Ram Aar design"
You need to close the file first:
for line in fin:
for word in a:
line = line.replace(word, "")
fout.write(line)
fout.close()
Using with is usually the best way to open files as it automatically closes them and file.seek() to go back to the start of the file to read :
def extract_nouns(i_location, o_location):
import nltk
with open(i_location, "r") as myfile:
data = myfile.read().replace('\n', '')
tokens = nltk.word_tokenize(data)
tagged = nltk.pos_tag(tokens)
length = len(tagged)
a = []
for i in range(0,length):
print(i)
log = (tagged[i][1][0] == 'N')
if not log:
a.append(tagged[i][0])
with open(i_location, 'r') as fin, open(o_location, "w+") as fout:
for line in fin:
for word in a:
line = line.replace(word, "")
fout.write(line)
fout.seek(0) # go back to start of file
data_out = fout.read().replace('\n' , '')
return data_out
The last statement in the function should be the return.
Because there is the print data_out, you return the return value of print which is none.
E.g:
In []: def test():
..: print 'Hello!'
..:
In []: res = test()
Hello!
In []: res is None
Out[]: True
How do I search and replace text in a file using Python 3?
Here is my code:
import os
import sys
import fileinput
print ("Text to search for:")
textToSearch = input( "> " )
print ("Text to replace it with:")
textToReplace = input( "> " )
print ("File to perform Search-Replace on:")
fileToSearch = input( "> " )
#fileToSearch = 'D:\dummy1.txt'
tempFile = open( fileToSearch, 'r+' )
for line in fileinput.input( fileToSearch ):
if textToSearch in line :
print('Match Found')
else:
print('Match Not Found!!')
tempFile.write( line.replace( textToSearch, textToReplace ) )
tempFile.close()
input( '\n\n Press Enter to exit...' )
Input file:
hi this is abcd hi this is abcd
This is dummy text file.
This is how search and replace works abcd
When I search and replace 'ram' by 'abcd' in above input file, it works as a charm. But when I do it vice-versa i.e. replacing 'abcd' by 'ram', some junk characters are left at the end.
Replacing 'abcd' by 'ram'
hi this is ram hi this is ram
This is dummy text file.
This is how search and replace works rambcd
As pointed out by michaelb958, you cannot replace in place with data of a different length because this will put the rest of the sections out of place. I disagree with the other posters suggesting you read from one file and write to another. Instead, I would read the file into memory, fix the data up, and then write it out to the same file in a separate step.
# Read in the file
with open('file.txt', 'r') as file :
filedata = file.read()
# Replace the target string
filedata = filedata.replace('abcd', 'ram')
# Write the file out again
with open('file.txt', 'w') as file:
file.write(filedata)
Unless you've got a massive file to work with which is too big to load into memory in one go, or you are concerned about potential data loss if the process is interrupted during the second step in which you write data to the file.
fileinput already supports inplace editing. It redirects stdout to the file in this case:
#!/usr/bin/env python3
import fileinput
with fileinput.FileInput(filename, inplace=True, backup='.bak') as file:
for line in file:
print(line.replace(text_to_search, replacement_text), end='')
As Jack Aidley had posted and J.F. Sebastian pointed out, this code will not work:
# Read in the file
filedata = None
with file = open('file.txt', 'r') :
filedata = file.read()
# Replace the target string
filedata.replace('ram', 'abcd')
# Write the file out again
with file = open('file.txt', 'w') :
file.write(filedata)`
But this code WILL work (I've tested it):
f = open(filein,'r')
filedata = f.read()
f.close()
newdata = filedata.replace("old data","new data")
f = open(fileout,'w')
f.write(newdata)
f.close()
Using this method, filein and fileout can be the same file, because Python 3.3 will overwrite the file upon opening for write.
You can do the replacement like this
f1 = open('file1.txt', 'r')
f2 = open('file2.txt', 'w')
for line in f1:
f2.write(line.replace('old_text', 'new_text'))
f1.close()
f2.close()
You can also use pathlib.
from pathlib2 import Path
path = Path(file_to_search)
text = path.read_text()
text = text.replace(text_to_search, replacement_text)
path.write_text(text)
(pip install python-util)
from pyutil import filereplace
filereplace("somefile.txt","abcd","ram")
Will replace all occurences of "abcd" with "ram".
The function also supports regex by specifying regex=True
from pyutil import filereplace
filereplace("somefile.txt","\\w+","ram",regex=True)
Disclaimer: I'm the author (https://github.com/MisterL2/python-util)
Open the file in read mode. Read the file in string format. Replace the text as intended. Close the file. Again open the file in write mode. Finally, write the replaced text to the same file.
try:
with open("file_name", "r+") as text_file:
texts = text_file.read()
texts = texts.replace("to_replace", "replace_string")
with open(file_name, "w") as text_file:
text_file.write(texts)
except FileNotFoundError as f:
print("Could not find the file you are trying to read.")
Late answer, but this is what I use to find and replace inside a text file:
with open("test.txt") as r:
text = r.read().replace("THIS", "THAT")
with open("test.txt", "w") as w:
w.write(text)
DEMO
With a single with block, you can search and replace your text:
with open('file.txt','r+') as f:
filedata = f.read()
filedata = filedata.replace('abc','xyz')
f.truncate(0)
f.write(filedata)
Your problem stems from reading from and writing to the same file. Rather than opening fileToSearch for writing, open an actual temporary file and then after you're done and have closed tempFile, use os.rename to move the new file over fileToSearch.
My variant, one word at a time on the entire file.
I read it into memory.
def replace_word(infile,old_word,new_word):
if not os.path.isfile(infile):
print ("Error on replace_word, not a regular file: "+infile)
sys.exit(1)
f1=open(infile,'r').read()
f2=open(infile,'w')
m=f1.replace(old_word,new_word)
f2.write(m)
Using re.subn it is possible to have more control on the substitution process, such as word splitted over two lines, case-(in)sensitive match. Further, it returns the amount of matches which can be used to avoid waste of resources if the string is not found.
import re
file = # path to file
# they can be also raw string and regex
textToSearch = r'Ha.*O' # here an example with a regex
textToReplace = 'hallo'
# read and replace
with open(file, 'r') as fd:
# sample case-insensitive find-and-replace
text, counter = re.subn(textToSearch, textToReplace, fd.read(), re.I)
# check if there is at least a match
if counter > 0:
# edit the file
with open(file, 'w') as fd:
fd.write(text)
# summary result
print(f'{counter} occurence of "{textToSearch}" were replaced with "{textToReplace}".')
Some regex:
add the re.I flag, short form of re.IGNORECASE, for a case-insensitive match
for multi-line replacement re.subn(r'\n*'.join(textToSearch), textToReplace, fd.read()), depending on the data also '\n{,1}'. Notice that for this case textToSearch must be a pure string, not a regex!
Besides the answers already mentioned, here is an explanation of why you have some random characters at the end:
You are opening the file in r+ mode, not w mode. The key difference is that w mode clears the contents of the file as soon as you open it, whereas r+ doesn't.
This means that if your file content is "123456789" and you write "www" to it, you get "www456789". It overwrites the characters with the new input, but leaves any remaining input untouched.
You can clear a section of the file contents by using truncate(<startPosition>), but you are probably best off saving the updated file content to a string first, then doing truncate(0) and writing it all at once.
Or you can use my library :D
I got the same issue. The problem is that when you load a .txt in a variable you use it like an array of string while it's an array of character.
swapString = []
with open(filepath) as f:
s = f.read()
for each in s:
swapString.append(str(each).replace('this','that'))
s = swapString
print(s)
I tried this and used readlines instead of read
with open('dummy.txt','r') as file:
list = file.readlines()
print(f'before removal {list}')
for i in list[:]:
list.remove(i)
print(f'After removal {list}')
with open('dummy.txt','w+') as f:
for i in list:
f.write(i)
you can use sed or awk or grep in python (with some restrictions). Here is a very simple example. It changes banana to bananatoothpaste in the file. You can edit and use it. ( I tested it worked...note: if you are testing under windows you should install "sed" command and set the path first)
import os
file="a.txt"
oldtext="Banana"
newtext=" BananaToothpaste"
os.system('sed -i "s/{}/{}/g" {}'.format(oldtext,newtext,file))
#print(f'sed -i "s/{oldtext}/{newtext}/g" {file}')
print('This command was applied: sed -i "s/{}/{}/g" {}'.format(oldtext,newtext,file))
if you want to see results on the file directly apply: "type" for windows/ "cat" for linux:
####FOR WINDOWS:
os.popen("type " + file).read()
####FOR LINUX:
os.popen("cat " + file).read()
I have done this:
#!/usr/bin/env python3
import fileinput
import os
Dir = input ("Source directory: ")
os.chdir(Dir)
Filelist = os.listdir()
print('File list: ',Filelist)
NomeFile = input ("Insert file name: ")
CarOr = input ("Text to search: ")
CarNew = input ("New text: ")
with fileinput.FileInput(NomeFile, inplace=True, backup='.bak') as file:
for line in file:
print(line.replace(CarOr, CarNew), end='')
file.close ()
I modified Jayram Singh's post slightly in order to replace every instance of a '!' character to a number which I wanted to increment with each instance. Thought it might be helpful to someone who wanted to modify a character that occurred more than once per line and wanted to iterate. Hope that helps someone. PS- I'm very new at coding so apologies if my post is inappropriate in any way, but this worked for me.
f1 = open('file1.txt', 'r')
f2 = open('file2.txt', 'w')
n = 1
# if word=='!'replace w/ [n] & increment n; else append same word to
# file2
for line in f1:
for word in line:
if word == '!':
f2.write(word.replace('!', f'[{n}]'))
n += 1
else:
f2.write(word)
f1.close()
f2.close()
def word_replace(filename,old,new):
c=0
with open(filename,'r+',encoding ='utf-8') as f:
a=f.read()
b=a.split()
for i in range(0,len(b)):
if b[i]==old:
c=c+1
old=old.center(len(old)+2)
new=new.center(len(new)+2)
d=a.replace(old,new,c)
f.truncate(0)
f.seek(0)
f.write(d)
print('All words have been replaced!!!')
I have worked this out as an exercise of a course: open file, find and replace string and write to a new file.
class Letter:
def __init__(self):
with open("./Input/Names/invited_names.txt", "r") as file:
# read the list of names
list_names = [line.rstrip() for line in file]
with open("./Input/Letters/starting_letter.docx", "r") as f:
# read letter
file_source = f.read()
for name in list_names:
with open(f"./Output/ReadyToSend/LetterTo{name}.docx", "w") as f:
# replace [name] with name of the list in the file
replace_string = file_source.replace('[name]', name)
# write to a new file
f.write(replace_string)
brief = Letter()
Like so:
def find_and_replace(file, word, replacement):
with open(file, 'r+') as f:
text = f.read()
f.write(text.replace(word, replacement))
def findReplace(find, replace):
import os
src = os.path.join(os.getcwd(), os.pardir)
for path, dirs, files in os.walk(os.path.abspath(src)):
for name in files:
if name.endswith('.py'):
filepath = os.path.join(path, name)
with open(filepath) as f:
s = f.read()
s = s.replace(find, replace)
with open(filepath, "w") as f:
f.write(s)
I'm trying to read a text from a text file, read lines, delete lines that contain specific string (in this case 'bad' and 'naughty').
The code I wrote goes like this:
infile = file('./oldfile.txt')
newopen = open('./newfile.txt', 'w')
for line in infile :
if 'bad' in line:
line = line.replace('.' , '')
if 'naughty' in line:
line = line.replace('.', '')
else:
newopen.write(line)
newopen.close()
I wrote like this but it doesn't work out.
One thing important is, if the content of the text was like this:
good baby
bad boy
good boy
normal boy
I don't want the output to have empty lines.
so not like:
good baby
good boy
normal boy
but like this:
good baby
good boy
normal boy
What should I edit from my code on the above?
You can make your code simpler and more readable like this
bad_words = ['bad', 'naughty']
with open('oldfile.txt') as oldfile, open('newfile.txt', 'w') as newfile:
for line in oldfile:
if not any(bad_word in line for bad_word in bad_words):
newfile.write(line)
using a Context Manager and any.
You could simply not include the line into the new file instead of doing replace.
for line in infile :
if 'bad' not in line and 'naughty' not in line:
newopen.write(line)
I have used this to remove unwanted words from text files:
bad_words = ['abc', 'def', 'ghi', 'jkl']
with open('List of words.txt') as badfile, open('Clean list of words.txt', 'w') as cleanfile:
for line in badfile:
clean = True
for word in bad_words:
if word in line:
clean = False
if clean == True:
cleanfile.write(line)
Or to do the same for all files in a directory:
import os
bad_words = ['abc', 'def', 'ghi', 'jkl']
for root, dirs, files in os.walk(".", topdown = True):
for file in files:
if '.txt' in file:
with open(file) as filename, open('clean '+file, 'w') as cleanfile:
for line in filename:
clean = True
for word in bad_words:
if word in line:
clean = False
if clean == True:
cleanfile.write(line)
I'm sure there must be a more elegant way to do it, but this did what I wanted it to.
Today I needed to accomplish a similar task so I wrote up a gist to accomplish the task based on some research I did.
I hope that someone will find this useful!
import os
os.system('cls' if os.name == 'nt' else 'clear')
oldfile = raw_input('{*} Enter the file (with extension) you would like to strip domains from: ')
newfile = raw_input('{*} Enter the name of the file (with extension) you would like me to save: ')
emailDomains = ['windstream.net', 'mail.com', 'google.com', 'web.de', 'email', 'yandex.ru', 'ymail', 'mail.eu', 'mail.bg', 'comcast.net', 'yahoo', 'Yahoo', 'gmail', 'Gmail', 'GMAIL', 'hotmail', 'comcast', 'bellsouth.net', 'verizon.net', 'att.net', 'roadrunner.com', 'charter.net', 'mail.ru', '#live', 'icloud', '#aol', 'facebook', 'outlook', 'myspace', 'rocketmail']
print "\n[*] This script will remove records that contain the following strings: \n\n", emailDomains
raw_input("\n[!] Press any key to start...\n")
linecounter = 0
with open(oldfile) as oFile, open(newfile, 'w') as nFile:
for line in oFile:
if not any(domain in line for domain in emailDomains):
nFile.write(line)
linecounter = linecounter + 1
print '[*] - {%s} Writing verified record to %s ---{ %s' % (linecounter, newfile, line)
print '[*] === COMPLETE === [*]'
print '[*] %s was saved' % newfile
print '[*] There are %s records in your saved file.' % linecounter
Link to Gist: emailStripper.py
Best,
Az
Use python-textops package :
from textops import *
'oldfile.txt' | cat() | grepv('bad') | tofile('newfile.txt')
The else is only connected to the last if. You want elif:
if 'bad' in line:
pass
elif 'naughty' in line:
pass
else:
newopen.write(line)
Also note that I removed the line substitution, as you don't write those lines anyway.
Try this works well.
import re
text = "this is bad!"
text = re.sub(r"(.*?)bad(.*?)$|\n", "", text)
text = re.sub(r"(.*?)naughty(.*?)$|\n", "", text)
print(text)
Regex is a little quicker than the accepted answer (for my 23 MB test file) that I used. But there isn't a lot in it.
import re
bad_words = ['bad', 'naughty']
regex = f"^.*(:{'|'.join(bad_words)}).*\n"
subst = ""
with open('oldfile.txt') as oldfile:
lines = oldfile.read()
result = re.sub(regex, subst, lines, re.MULTILINE)
with open('newfile.txt', 'w') as newfile:
newfile.write(result)
to_skip = ("bad", "naughty")
out_handle = open("testout", "w")
with open("testin", "r") as handle:
for line in handle:
if set(line.split(" ")).intersection(to_skip):
continue
out_handle.write(line)
out_handle.close()
bad_words = ['doc:', 'strickland:','\n']
with open('linetest.txt') as oldfile, open('linetestnew.txt', 'w') as newfile:
for line in oldfile:
if not any(bad_word in line for bad_word in bad_words):
newfile.write(line)
The \n is a Unicode escape sequence for a newline.