Deleting one line within a txt file in Python

I am having problems deleting a specific line/entry within a text file. With the code I have, the top line in the file is deleted no matter which line number I select to delete.
def erase():
    contents = {}
    f = open('members.txt', 'a')
    f.close()
    f = open('members.txt', 'r')
    index = 0
    for line in f:
        index = index + 1
        contents[index] = line
        print("{0:3d}) {1}".format(index, line))
    f.close()
    total = index
    entry = input("Enter number to be deleted")
    f = open('members.txt', 'w')
    index = 0
    for index in range(1, total):
        index = index + 1
        if index != entry:
            f.write(contents[index])

Try this:
import sys
import os

def erase(file):
    assert os.path.isfile(file)
    with open(file, 'r') as f:
        content = f.read().split("\n")
    # print(content)
    entry = int(input("Enter number to be deleted:"))
    assert entry >= 0 and entry < len(content)
    new_file = content[:entry] + content[entry+1:]
    # print(new_file)
    with open(file, 'w') as f:
        f.write("\n".join(new_file))

if __name__ == '__main__':
    erase(sys.argv[1])
As already noted, you were starting the range from 1, which is incorrect. The list slicing I used in new_file = content[:entry] + content[entry+1:] makes the code more readable and is less prone to off-by-one errors. Note too that input() returns a string in Python 3, so the entered number needs converting with int() before it can be compared against indices.
You also open and close the input file at the beginning for no reason, and you should use with where possible when doing operations with files.
Finally, I used join and split to simplify the code, so you don't need a for loop to process the lines of the file.
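For reference, here is a minimal sketch of the same fix applied back to the original 1-based, line-numbered approach (keeping the question's members.txt filename; the prompt wording is illustrative):
def erase():
    # read everything once so the file can be rewritten in place
    with open('members.txt', 'r') as f:
        lines = f.readlines()
    # show 1-based line numbers, as the original did
    for number, line in enumerate(lines, start=1):
        print("{0:3d}) {1}".format(number, line), end='')
    entry = int(input("Enter number to be deleted: "))
    with open('members.txt', 'w') as f:
        for number, line in enumerate(lines, start=1):
            if number != entry:
                f.write(line)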

Related

How to create multiple files after each 10,000 records in for loop after fetch?

I have about 1,000,000 records that I would like to loop through and write into about 10 different files.
Below is my code, which successfully creates a single large file:
f = open("c:\temp\filename", 'w+')
# write to file
for rec in full_data:
f.write(rec[0])
f.write("\n")
I want to create 10 files with 100,000 records in each file.
I tried creating multiple files, but this creates a file for each row:
for index, line in enumerate(full_data):
    with open('filename{}.txt'.format(index), 'w') as output:
        output.write(line[0])
Please let me know how I can create a file for every 100,000 records.
That's because at each loop iteration you are creating a new file as output; do that only when necessary.
For example, the following piece of code creates a new file every three items:
>>> data = [1,2,3,4,5,6,7,8,9,10]
>>> output = open('filename0.txt', 'w')
>>> for index, line in enumerate(data):
...     if index % 3 == 0:
...         output = open('filename{}.txt'.format(index), 'w')
...     output.write(str(line) + "\n")
I was able to resolve this issue using the code below:
f = open(full_file_name, 'w+')
rec_counter = 0
file_name_counter = 1
# write to file
for index, rec in enumerate(full_run_data):
    if rec_counter == 500000:
        f = open(repeat_file_name.format(file_name_counter), 'w')
        # print('%s' % (rec[0]))
        f.write(rec[0])
        f.write("\n")
        rec_counter = 0
        file_name_counter = file_name_counter + 1
    else:
        rec_counter = rec_counter + 1
        f.write(rec[0])
        f.write("\n")
f.close()
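A more compact way to express the same chunking, as a sketch: assuming full_data from the question and one output file per 100,000 records (the chunk_size name and filename pattern are illustrative), the output file can be derived from the record index directly.
chunk_size = 100000  # records per output file
output = None
for index, rec in enumerate(full_data):
    if index % chunk_size == 0:
        if output is not None:
            output.close()  # close the finished chunk before starting a new one
        output = open('filename{}.txt'.format(index // chunk_size), 'w')
    output.write(rec[0] + "\n")
if output is not None:
    output.close()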

Removing phrases from a file based on another file (Python)

How do I do this in python?
badphrases.txt contains
Go away
Don't do that
Stop it
allphrases.txt contains
I don't know why you do that. Go away.
I was wondering what you were doing.
You seem nice
I want allphrases.txt to be clean of the lines in badphrases.txt.
It's trivial in bash
cat badfiles.txt | while read b
do
cat allphrases.txt | grep -v "$b" > tmp
cat tmp > allphrases.txt
done
Oh, you thought I hadn't looked or tried. I searched for over an hour.
Here's my code:
# Files
ttv = "/tmp/tv.dat"
tmp = "/tmp/tempfile"
bad = "/tmp/badshows"
badfiles already exists
...code right here creates ttv
# Function grep_v
def grep_v(f, str):
    file = open(f, "r")
    for line in file:
        if line in str:
            return True
    return False

t = open(tmp, 'w')
tfile = open(ttv, "r")
for line in tfile:
    if not grep_v(bad, line):
        t.write(line)
tfile.close
t.close
os.rename(tmp, ttv)
First, google how to read a file in Python. You will probably get something like this: How do I read a file line-by-line into a list?
Use that to read both files into lists:
with open('badphrases.txt') as f:
    content = f.readlines()
badphrases = [x.strip() for x in content]

with open('allphrases.txt') as f:
    content = f.readlines()
allphrases = [x.strip() for x in content]
Now you have both files' contents in lists.
Iterate over allphrases and check whether any phrase from badphrases is present in each line.
At this point you might consider googling:
how to iterate over a list python
how to check if string present in another string python
Take the code from those places and build a brute-force algorithm like this:
for line in allphrases:
    flag = True
    for badphrase in badphrases:
        if badphrase in line:
            flag = False
            break
    if flag:
        print(line)
If you can understand this code, you will notice that you need to replace print with output to a file. Now google how to print to a file in Python.
Then think about how to improve the algorithm. All the best.
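Putting those pieces together, a minimal end-to-end sketch (using the question's file names; rewriting allphrases.txt in place is one possible choice) might look like this:
with open('badphrases.txt') as f:
    # skip blank lines: an empty "bad phrase" would match everything
    badphrases = [x.strip() for x in f if x.strip()]

with open('allphrases.txt') as f:
    allphrases = [x.strip() for x in f]

# rewrite allphrases.txt, keeping only lines containing no bad phrase
with open('allphrases.txt', 'w') as out:
    for line in allphrases:
        if not any(bad in line for bad in badphrases):
            out.write(line + '\n')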
UPDATE:
@COLDSPEED suggested you can simply google how to replace lines in a file in Python. You might get something like this: Search and replace a line in a file in Python.
That also works.
My solution; not too bad.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import feedparser, os, re

# Files
h = os.environ['HOME']
ttv = h + "/WEB/Shows/tv.dat"
old = h + "/WEB/Shows/old.dat"
moo = h + "/WEB/Shows/moo.dat"
tmp = h + "/WEB/Shows/tempfile"
bad = h + "/WEB/Shows/badshows"

# Function not_present
def not_present(f, str):
    file = open(f, "r")
    for line in file:
        if str in line:
            return False
    return True

# Sources (shortened)
sources = ['http://predb.me/?cats=tv&rss=1']

# Grab all the feeds and put them into ttv and old
k = open(old, 'a')
f = open(ttv, 'a')
for h in sources:
    d = feedparser.parse(h)
    for post in d.entries:
        if not_present(old, post.link):
            f.write(post.title + "|" + post.link + "\n")
            k.write(post.title + "|" + post.link + "\n")
f.close()
k.close()

# Remove shows without [Ss][0-9] and put them in moo
m = open(moo, 'a')
t = open(tmp, 'w')
file = open(ttv, "r")
for line in file:
    if re.search(r's[0-9]', line, re.I) is None:
        m.write(line)
        # print("moo", line)
    else:
        t.write(line)
        # print("tmp", line)
t.close()
m.close()
os.rename(tmp, ttv)

# Remove badshows
t = open(tmp, 'w')
with open(bad) as f:
    content = f.readlines()
bap = [x.strip() for x in content]
with open(ttv) as f:
    content = f.readlines()
all = [x.strip() for x in content]
for line in all:
    flag = True
    for b in bap:
        if b in line:
            flag = False
            break
    if flag:
        t.write(line + "\n")
t.close()
os.rename(tmp, ttv)

Python variable/count not increasing

My code attempts to create a folder and then download a PDF into the corresponding folder. The variable and counter i keeps track of which folder to download to, but it does not seem to be updating. At the end of the elif statement I want i to increase by 1. I'm fairly new to Python; if a similar situation were coded in Java I know this would work just fine, so I don't understand why it isn't working here.
import re
import os
import urllib

f = open("newfile.txt")
suffix = '.pdf'
for line in f:
    i = 0
    folderopt = str(i)
    if suffix in line:
        print('download')
        url = line.rstrip('\n')
        pdfname = url.split('LTN', 1)[1]
        print('download to:' + '/Users/user/Desktop/PDF/' + folderopt + '/' + pdfname)
        urllib.urlretrieve(url, '/Users/user/Desktop/PDF/' + folderopt + '/' + pdfname)
    elif line > i:
        filename = line.rstrip('\n')
        print('code:' + filename)
        os.mkdir('/Users/user/Desktop/PDF/' + filename)
        global i
        i = i + 1
f.close
EDIT: I put the variable outside of the for loop, but I am still getting this:
IOError: [Errno 2] No such file or directory: '/Users/user/Desktop/PDF/0/20160412398.pdf'
The i count has not increased, even though the folder /Users/user/Desktop/PDF/1 was created.
I changed the elif statement to
elif int(line.rstrip('\n')) > i
and it is still not working.
You set i to 0 inside the loop; try this:
import re
import os
import urllib

f = open("newfile.txt")
suffix = '.pdf'
i = 0
for line in f:
    folderopt = str(i)
    if suffix in line:
        print('download')
        url = line.rstrip('\n')
        pdfname = url.split('LTN', 1)[1]
        print('download to:' + '/Users/user/Desktop/PDF/' + folderopt + '/' + pdfname)
        urllib.urlretrieve(url, '/Users/user/Desktop/PDF/' + folderopt + '/' + pdfname)
    elif line > i:
        filename = line.rstrip('\n')
        print('code:' + filename)
        os.mkdir('/Users/user/Desktop/PDF/' + filename)
        global i
        i += 1
f.close
That's because the first line in your loop resets i to 0.
Try
i = 0
for line in f:
instead of
for line in f:
    i = 0
EDIT:
Also, the i += 1 should be outside the elif body, like so:
elif ...:
    ...
i += 1
not
elif ...:
    ...
    i += 1
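To see the counter fix in isolation, here is a minimal sketch of the pattern (the print calls stand in for the question's download and mkdir steps, and newfile.txt is the question's input file):
i = 0  # initialise once, before the loop, so the value persists across lines
with open("newfile.txt") as f:
    for line in f:
        line = line.rstrip('\n')
        if line.endswith('.pdf'):
            print('would download into folder', i)
        else:
            print('would create a folder for code', line)
            i += 1  # advance the folder counter only when a new code line appears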

How to split text file by id in python

I have a bunch of text files containing tab separated tables. The second column contains an id number, and each file is already sorted by that id number. I want to separate each file into multiple files by the id number in column 2. Here's what I have.
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
    with open(readpath+filename, 'r') as fh:
        lines = fh.readlines()
    lastid = 0
    f = open(writepath+'checkme.txt', 'w')
    f.write(filename)
    for line in lines:
        thisid = line.split("\t")[1]
        if int(thisid) <> lastid:
            f.close()
            f = open(writepath+thisid+'-'+filename, 'w')
            lastid = int(thisid)
        f.write(line)
    f.close()
What I get is simply a copy of all the read files, with the first id number from each file prepended to the new filenames. It is as if
thisid = line.split("\t")[1]
were only evaluated once in the loop. Any clue as to what is going on?
EDIT
The problem was that my files used \r rather than \r\n to terminate lines. Corrected code (simply adding 'rU' when opening the read file and replacing <> with !=):
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
    with open(readpath+filename, 'rU') as fh:
        lines = fh.readlines()
    lastid = 0
    f = open(writepath+'checkme.txt', 'w')
    f.write(filename)
    for line in lines:
        thisid = line.split("\t")[1]
        if int(thisid) != lastid:
            f.close()
            f = open(writepath+thisid+'-'+filename, 'w')
            lastid = int(thisid)
        f.write(line)
    f.close()
If you're dealing with tab-delimited files, you can use the csv module and take advantage of the fact that itertools.groupby will do the previous/current tracking of the id for you. Also use os.path.join to make sure your file paths join correctly.
Untested:
import os
import csv
from itertools import groupby

readpath = 'path-to-read-file'
writepath = 'path-to-write-file'

for filename in os.listdir(readpath):
    with open(os.path.join(readpath, filename)) as fin:
        tabin = csv.reader(fin, delimiter='\t')
        for file_id, rows in groupby(tabin, lambda L: L[1]):
            with open(os.path.join(writepath, file_id + '-' + filename), 'w') as fout:
                tabout = csv.writer(fout, delimiter='\t')
                tabout.writerows(rows)
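One thing worth knowing about groupby: it only groups consecutive rows with equal keys, which is exactly why it fits here (the files are already sorted by id), and why unsorted input would produce fragmented groups. A toy illustration:
from itertools import groupby

rows = [['a', '1'], ['b', '1'], ['c', '2'], ['d', '1']]
for key, group in groupby(rows, lambda r: r[1]):
    print(key, [r[0] for r in group])
# prints:
# 1 ['a', 'b']
# 2 ['c']
# 1 ['d']   <- a second run of '1' forms a separate group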

Python -- How to split headers/chapters into separate files automatically

I'm converting text directly to epub and I'm having a problem automatically splitting the HTML book file into separate header/chapter files. At the moment, the code below partially works but only creates every other chapter file, so half the header/chapter files are missing from the output. Here is the code:
def splitHeaderstoFiles(fpath):
    infp = open(fpath, 'rt', encoding=('utf-8'))
    for line in infp:
        # format and split headers to files
        if '<h1' in line:
            #-----------format header file names and other stuff ------------#
            # create a new file for the header/chapter section
            path = os.getcwd() + os.sep + header
            with open(path, 'wt', encoding=('utf-8')) as outfp:
                # write html top meta headers
                outfp = addMetaHeaders(outfp)
                # add the header
                outfp = outfp.write(line)
                # add the chapter/header bodytext
                for line in infp:
                    if '<h1' not in line:
                        outfp.write(line)
                    else:
                        outfp.write('</body>\n</html>')
                        break
        else:
            continue
    infp.close()
The problem occurs in the second for loop at the bottom of the code, when I look for the next h1 tag to stop the split. I cannot use seek() or tell() to rewind or move back one line so the program can find the next header/chapter on the next iteration; apparently you cannot use these inside a for loop that is iterating over the file, as it just gives a 'can't do nonzero cur-relative seeks' error.
I've also tried a while line != '' / readline() combination in the code, which gives the same error as above.
Does anyone know an easy way to split HTML headers/chapters of varying lengths into separate files in Python? Are there any special Python modules (such as pickle) that could help make this task easier?
I'm using Python 3.4.
My grateful thanks in advance for any solutions to this problem...
I ran into a similar problem a while ago; here is a simplified solution:
from itertools import count

chapter_number = count(1)

output_file = open('000-intro.html', 'wt')
with open('index.html', 'rt') as input_file:
    for line in input_file:
        if '<h1' in line:
            output_file.close()
            output_file = open('{:03}-chapter.html'.format(next(chapter_number)), 'wt')
        output_file.write(line)
output_file.close()
In this approach, the block of text leading up to the first h1 tag is written into 000-intro.html, the first chapter is written into 001-chapter.html, and so on. Please modify it to taste.
The solution is a simple one: upon encountering an h1 tag, close the previous output file and open a new one.
You are looping over your input file twice, which is likely causing your problems:
for line in infp:
    ...
    with open(path, 'wt', encoding=('utf-8')) as outfp:
        ...
        for line in infp:
            ...
Both loops draw from the same file iterator, and the inner loop consumes the line containing the next <h1> before breaking, so the outer loop never sees it; that is why every other chapter goes missing.
You might try replacing the for loops with explicit readline() calls, so you control exactly when the next line is consumed:
while True:
    line = infp.readline()
    if not line:  # readline() returns '' at end of file
        break
    if '<h1' in line:
        with open(...) as outfp:
            while True:
                line = infp.readline()
                if not line or '<h1' in line:
                    break
                outfp.write(line)
Alternatively, you may wish to use an HTML parser (e.g., BeautifulSoup). Then you can do something like what is described here: https://stackoverflow.com/a/8735688/65295.
Update from comment: essentially, read the entire file all at once so you can freely move back or forward as necessary. This probably won't be a performance issue unless you have a really, really big file (or very little memory).
lines = infp.readlines()  # read the entire file
i = 0
while i < len(lines):
    if '<h1' in lines[i]:
        with open(...) as outfp:
            outfp.write(lines[i])  # the header line itself
            j = i + 1
            while j < len(lines):
                if '<h1' in lines[j]:
                    break
                outfp.write(lines[j])
                j += 1  # advance, or this inner loop never ends
            # line j has an <h1> (or is past the end); set i to j so we
            # detect it at the top of the next loop iteration.
            i = j
    else:
        i += 1
I eventually found the answer to the above problem. The code below does a lot more than just split on the headers: it also loads two parallel lists with the formatted file names (with extension) and the pure header names respectively, so I can use them to fill in the <title> text and file-name extensions in these HTML files within a single while loop, in one hit. The code now works well and is shown below.
def splitHeaderstoFiles(dir, inpath):
    count = 1
    t_count = 0
    out_path = ''
    header = ''
    write_bodytext = False
    file_path_names = []
    pure_header_names = []
    inpath = dir + os.sep + inpath
    with open(inpath, 'rt', encoding=('utf-8')) as infp:
        for line in infp:
            if '<h1' in line:
                # strip html tags, convert to start caps
                p = re.compile(r'<.*?>')
                header = p.sub('', line)
                header = capwords(header)
                line_save = header
                # Add 0 for count below 10
                if count < 10:
                    header = '0' + str(count) + '_' + header
                else:
                    header = str(count) + '_' + header
                # remove all spaces + add extension in header
                header = header.replace(' ', '_')
                header = header + '.xhtml'
                count = count + 1
                # create two parallel lists used later
                out_path = dir + os.sep + header
                outfp = open(out_path, 'wt', encoding=('utf-8'))
                file_path_names.insert(t_count, out_path)
                pure_header_names.insert(t_count, line_save)
                t_count = t_count + 1
                # Add html meta headers and write it
                outfp = addMainHeaders(outfp)
                outfp.write(line)
                write_bodytext = True
            # add header bodytext
            elif write_bodytext == True:
                outfp.write(line)
    # now add html titles and close the html tails on all files
    max_num_files = len(file_path_names)
    tmp = dir + os.sep + 'temp1.tmp'
    i = 0
    while i < max_num_files:
        outfp = open(tmp, 'wt', encoding=('utf-8'))
        infp = open(file_path_names[i], 'rt', encoding=('utf-8'))
        for line in infp:
            if '<title>' in line:
                line = line.strip(' ')
                line = line.replace('<title></title>', '<title>' + pure_header_names[i] + '</title>')
                outfp.write(line)
            else:
                outfp.write(line)
        # add the html tail
        if '</body>' in line or '</html>' in line:
            pass
        else:
            outfp.write(' </body>' + '\n</html>')
        # clean up
        infp.close()
        outfp.close()
        shutil.copy2(tmp, file_path_names[i])
        os.remove(tmp)
        i = i + 1
    # now rename just the title page
    if os.path.isfile(file_path_names[0]):
        title_page_name = file_path_names[0]
        new_title_page_name = dir + os.sep + '01_Title.xhtml'
        os.rename(title_page_name, new_title_page_name)
        file_path_names[0] = '01_Title.xhtml'
    else:
        logmsg27(DEBUG_FLAG)
        os._exit(0)
    # xhtml file is no longer needed
    if os.path.isfile(inpath):
        os.remove(inpath)
    # returned list values are also used
    # later to create epub opf and ncx files
    return(file_path_names, pure_header_names)
@Hai Vu and @Seth -- thanks for all your help.
