Data generation Python

I'm trying to generate a dataset based on an existing one. I was able to implement a method that randomly changes the contents of files, but I can't write all of this back to a file. Moreover, I also need to write the number of changed words to the file, since I want to use this dataset to train a neural network. Could you help me?
Input: files with 2 lines of text each.
Output: files with 3 lines (maybe): the first line does not change, the second changes according to the method, and the third shows the number of words changed (if it is better to do this differently for deep learning tasks, I would be glad for advice, since I'm a beginner).
from random import randrange
import os

Path = "D:\corrected data\\"
filelist = os.listdir(Path)

if __name__ == "__main__":
    new_words = ['consultable', 'partie ', 'celle ', 'également ', 'forte ', 'statistiques ', 'langue ',
                 'cadeaux', 'publications ', 'notre', 'nous', 'pour', 'suivr', 'les', 'vos', 'visitez ', 'thème ', 'thème ', 'thème ', 'produits', 'coulisses ', 'un ', 'atelier ', 'concevoir ', 'personnalisés ', 'consultable', 'découvrir ', 'fournit ', 'trace ', 'dire ', 'tableau', 'décrire', 'grande ', 'feuille ', 'noter ', 'correspondant', 'propre',]
    nb_words_to_replace = randrange(10)
    # with open("1.txt") as file:
    for i in filelist:
        # if i.endswith(".txt"):
        with open(Path + i, "r", encoding="utf-8") as file:
            # for line in file:
            data = file.readlines()
            first_line = data[0]
            second_line = data[1]
            print(f"Original: {second_line}")
            # print(f"File: {file}")
            second_line_array = second_line.split(" ")
            for j in range(nb_words_to_replace):
                replacement_position = randrange(len(second_line_array))
                old_word = second_line_array[replacement_position]
                new_word = new_words[randrange(len(new_words))]
                print(f"Position {replacement_position} : {old_word} -> {new_word}")
                second_line_array[replacement_position] = new_word
            res = " ".join(second_line_array)
            print(f"Result: {res}")
            with open(Path + i, "w") as f:
                for line in file:
                    if line == second_line:
                        f.write(res)

In short, you have two questions:
How to properly replace line number 2 (and 3) of the file.
How to keep track of the number of words changed.
How to properly replace line number 2 (and 3) of the file.
Your code:
with open(Path + i,"w") as f:
for line in file:
if line == second_line:
f.write(res)
Reading is not enabled: f is opened for writing only, so for line in file would not work on it anyway. Besides, f is defined, but file is used instead, and file has already been read to the end, so iterating over it yields nothing. To fix this, do the following instead:
with open(Path + i,"r+") as file:
lines = file.read().splitlines() # splitlines() removes the \n characters
lines[1] = second_line
file.writelines(lines)
However, you want to add more lines to the file, so I suggest you structure the logic differently.
How to keep track of the number of words changed.
Add a variable changed_words_count and increment it when old_word != new_word.
Resulting code:
for i in filelist:
    filepath = Path + i
    # The lines that will be replacing the file
    new_lines = [""] * 3
    with open(filepath, "r", encoding="utf-8") as file:
        data = file.readlines()
        first_line = data[0]
        second_line = data[1]
    second_line_array = second_line.split(" ")
    changed_words_count = 0
    for j in range(nb_words_to_replace):
        replacement_position = randrange(len(second_line_array))
        old_word = second_line_array[replacement_position]
        new_word = new_words[randrange(len(new_words))]
        # A word replaced does not mean the word has changed.
        # It could be replacing itself.
        # Check if the replacing word is different.
        if old_word != new_word:
            changed_words_count += 1
            second_line_array[replacement_position] = new_word
    # Add the lines to the new file lines
    new_lines[0] = first_line
    new_lines[1] = " ".join(second_line_array)
    new_lines[2] = str(changed_words_count)
    print(f"Result: {new_lines[1]}")
    with open(filepath, "w") as file:
        file.writelines(new_lines)
Note: Code not tested.
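As a quick sanity check, you could run the loop against a single hand-made file first. A minimal harness along these lines might help (the folder, file name, and contents here are made up for illustration):

import os

# Hypothetical setup: create one two-line sample file in the data folder.
Path = "D:\\corrected data\\"
os.makedirs(Path, exist_ok=True)
with open(Path + "sample.txt", "w", encoding="utf-8") as f:
    f.write("première ligne\n")
    f.write("ceci est la deuxième ligne\n")

# ...run the loop above with filelist = os.listdir(Path), then inspect:
with open(Path + "sample.txt", "r", encoding="utf-8") as f:
    print(f.read())  # expect the original line 1, a mutated line 2, and the change count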

Related

List index out of range with stanford-nlp

I'm trying to remove all blank lines from a large .txt file, but whatever method I use it always returns this traceback:
Traceback (most recent call last):
File "C:\Users\svp12\PycharmProjects\practiques\main.py", line 53, in <module>
doc = nlp(texts[line])
IndexError: list index out of range
If I don't remove these blank lines, I get IndexErrors in the subsequent 2 for loops (or at least I think that's the reason), which is why I'm using the try/except like this:
try:
    for word in doc.sentences[0].words:
        noun.append(word.text)
        lemma.append(word.lemma)
        pos.append(word.pos)
        xpos.append(word.xpos)
        deprel.append(word.deprel)
except IndexError:
    errors += 1
    pass
I'd like to be able to remove all blank lines and not have to work around IndexErrors like this. Any idea how to fix it?
Here's the whole code:
import io
import stanza
import os

def linecount(filename):
    ffile = open(filename, 'rb')
    lines = 0
    buf_size = 1024 * 1024
    read_f = ffile.read
    buf = read_f(buf_size)
    while buf:
        lines += buf.count(b'\n')
        buf = read_f(buf_size)
    return lines

errors = 0

with io.open('#_Calvia_2018-01-01_2022-04-01.txt', 'r+', encoding='utf-8') as f:
    text = f.read()

# replacing eos with \n, numbers and symbols
texts = text.replace('eos', '.\n')
texts = texts.replace('0', ' ').replace('1', ' ').replace('2', ' ').replace('3', ' ').replace('4', ' ')\
    .replace('5', ' ').replace('6', ' ').replace('7', ' ').replace('8', ' ').replace('9', ' ').replace(',', ' ')\
    .replace('"', ' ').replace('·', ' ').replace('?', ' ').replace('¿', ' ').replace(':', ' ').replace(';', ' ')\
    .replace('-', ' ').replace('!', ' ').replace('¡', ' ').replace('.', ' ').splitlines()

os.system("sed -i \'/^$/d\' #_Calvia_2018-01-01_2022-04-01.txt")  # removing empty lines to avoid IndexError

nlp = stanza.Pipeline(lang='ca')

nouns = []
lemmas = []
poses = []
xposes = []
heads = []
deprels = []

total_lines = linecount('#_Calvia_2018-01-01_2022-04-01.txt') - 1

for line in range(50):  # range should be total_lines, which is 6682
    noun = []
    lemma = []
    pos = []
    xpos = []
    head = []
    deprel = []
    # print('analyzing: '+str(line+1)+' / '+str(len(texts)), end='\r')
    doc = nlp(texts[line])
    try:
        for word in doc.sentences[0].words:
            noun.append(word.text)
            lemma.append(word.lemma)
            pos.append(word.pos)
            xpos.append(word.xpos)
            deprel.append(word.deprel)
    except IndexError:
        errors += 1
        pass
    try:
        for word in doc.sentences[0].words:
            head.extend([lemma[word.head-1] if word.head > 0 else "root"])
    except IndexError:
        errors += 1
        pass
    nouns.append(noun)
    lemmas.append(lemma)
    poses.append(pos)
    xposes.append(xpos)
    heads.append(head)
    deprels.append(deprel)

print(nouns)
print(lemmas)
print(poses)
print(xposes)
print(heads)
print(deprels)
print("errors: " + str(errors))  # weird, seems to be range/2-1
And as a side question, is it worth importing os just for this line (the one removing the blank lines)?
os.system("sed -i \'/^$/d\' #_Calvia_2018-01-01_2022-04-01.txt")
I can't guarantee that this works because I couldn't test it, but it should give you an idea of how you'd approach this task in Python.
I'm omitting the head processing (the second loop) here; that's for you to figure out.
I'd recommend you throw some prints in there and look at the output, make sure you understand what's going on (especially with different data types) and look at examples of applications using Stanford NLP, watch some tutorials online (from start to finish, no skipping), etc.
import stanza
import re

def clean(line):
    # function that does the text cleaning
    line = line.replace('eos', '.\n')
    line = re.sub(r'[\d,"·?¿:;!¡.-]', ' ', line)
    return line.strip()

nlp = stanza.Pipeline(lang='ca')

# instead of individual variables, you could keep the values in a dictionary
# (or just leave them as they are - your call)
values_to_extract = ['text', 'lemma', 'pos', 'xpos', 'deprel']
data = {v: [] for v in values_to_extract}

with open('#_Calvia_2018-01-01_2022-04-01.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # clean the text
        line = clean(line)
        # skip empty lines
        if not line:
            continue
        doc = nlp(line)
        # loop over sentences – this will work even if it's an empty list
        for sentence in doc.sentences:
            # append a new list to the dictionary entries
            for v in values_to_extract:
                data[v].append([])
            for word in sentence.words:
                for v in values_to_extract:
                    # extract the attribute (e.g.,
                    # a surface form, a lemma, a pos tag, etc.)
                    attribute = getattr(word, v)
                    # and add it to its slot
                    data[v][-1].append(attribute)

for v in values_to_extract:
    print('Value:', v)
    print(data[v])
    print()
Because texts doesn't have 50 lines. Why do you hardcode 50?
If you just need to remove blank lines, you only have to do text = text.replace("\n\n", "\n").
If you need to remove lines that are just whitespace, you can do:
text = '\n'.join(line.rstrip() for line in text.split('\n') if line.strip())
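For instance, a small illustrative snippet (with made-up input) showing what that one-liner does:

text = "first line\n   \n\nsecond line\n"

# Drops empty lines and lines containing only whitespace.
text = '\n'.join(line.rstrip() for line in text.split('\n') if line.strip())
print(repr(text))  # 'first line\nsecond line'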

Reading and Printing multiple files into one outfile

When reading through my files, printing to my console gives me the correct result, but writing to the outfile does not.
with infile as f:
    lines = f.readlines()
    new_line = " "
    for line in lines:
        new_line = ''.join(line).replace('*', letter.upper())
        new_line = new_line.replace(':', letter.lower())
        print(new_line)
This prints out all of the letters that I inputted
with infile as f:
    lines = f.readlines()
    new_line = " "
    for line in lines:
        new_line = ''.join(line).replace('*', letter.upper())
        new_line = new_line.replace(':', letter.lower())
        outfile.write(new_line)
It only gives me the last letter of the word inputted.
folder = r"C:\Users\sarah\Documents\a CPS 111\Bonus PA\stars\stars"
# os.listdir(folder) returns a list of files in folder
file_list = os.listdir(folder)
letter_art = {}
word = str(input("Please input a letter: "))
word = word.upper()
for fname in file_list:
letter_extension_list = fname.split(".")
for letter in word:
key = letter
value = letter_extension_list[1]
value = "%s."%(key) + value
letter_art[key] = value
fname = "\\".join([folder, value])
infile = open(fname, "r")
outfile = open("word_art.txt", "w")
with infile as f :
lines = f.readlines()
new_line = " "
for line in lines:
new_line = ''.join(line).replace('*',letter.upper())
new_line = new_line.replace(':',letter.lower())
print(new_line)
outfile.write(new_line)
infile.close()
outfile.close()
This is the code I am currently working with. I am taking in symbols from a txt file and changing them to the corresponding letter depending on what the user inputted.
Open the output file before the loop instead of within it:
outfile = open("word_art.txt", "w")
for letter in word:
    # ... the rest of the loop body stays as it was
with open("test.txt",'r') as f :
lines = f.readlines()
with open('out.txt','w') as outfile:
for line in lines:
new_line = line.replace('*',letter.upper())
new_line = new_line.replace(':',letter.lower())
outfile.write(new_line)
This worked for me.
EDIT:
TigerhawkT3 is correct. I checked out your full code and you were opening the file again and again inside the loop, each time discarding the prior changes.
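To see why reopening a file with mode "w" loses earlier output, here is a small illustrative snippet (hypothetical file names):

# Opening with "w" inside the loop truncates the file on every pass,
# so only the last iteration survives:
for letter in "AB":
    with open("demo_bad.txt", "w") as out:
        out.write(letter + "\n")
# demo_bad.txt now contains only "B"

# Opening once before the loop keeps everything:
with open("demo_good.txt", "w") as out:
    for letter in "AB":
        out.write(letter + "\n")
# demo_good.txt now contains both "A" and "B"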

Python: Add a new line after the first word in a sentence if the first word is all caps

I'm trying to modify a txt file. The file is a movie script in the format:
BEN We’ve discussed this before.
LUKE I can be a Jedi. I’m ready.
I'd like to insert a new line after the character name:
BEN
We’ve discussed this before.
LUKE
I can be a Jedi. I’m ready.
How do I do this in Python? I currently have:
def modify_file(file_name):
    fh = fileinput.input(file_name, inplace=True)
    for line in fh:
        split_line = line.split()
        if (len(split_line) > 0):
            first_word = split_line[0]
            replacement = first_word + '\n'
            first_word = first_word.replace(first_word, replacement)
            sys.stdout.write(first_word)
    fh.close()
As suggested in one of the comments, this can be done using split and isupper. An example is provided below:
source_path = 'source_path.txt'

f = open(source_path)
lines = f.readlines()
f.close()

temp = ''
for line in lines:
    words = line.split(' ')
    if words[0].isupper():
        temp += words[0] + '\n' + ' '.join(words[1:])
    else:
        temp += line

f = open(source_path, 'w')
f.write(temp)
f.close()
There are multiple problems with your code.
import fileinput
import sys  # needed for sys.stdout.write

def modify_file(file_name):
    fh = fileinput.input(file_name, inplace=True)
    for line in fh:
        split_line = line.split()
        if (len(split_line) > 0):
            x = split_line[0] + "\n" + " ".join(split_line[1:]) + "\n"
            sys.stdout.write(x)
    fh.close()  # ==> this cannot be inside the if block; it has to be at the outer for level

Pick parts from a txt file and copy to another file with python

I'm in trouble here. I need to read a .txt file that contains a sequence of records, check the records, and copy the ones I want to a new file.
The file content is like this (this is just an example, the original file has more than 30 000 lines):
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|316 #begin register
03000|SP|467
99999|33|130 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
The records that begin with 03000 and have the characters 'TO' must be written to a new file. Based on the example, the file should look like this:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Code:
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.read()
file.close()
# here I need to check whether a 03000 record with the characters 'TO' exists; if it does, copy the whole record set (00000 through 99999) to the new file
I did multiple searches and found nothing to help me.
Thank you!
with open("file.txt",'r') as inFile, open("newFile.txt","w") as outFile:
outFile.writelines(line for line in inFile
if line.startswith("03000") and "TO" in line)
If you need the previous and the next line, then you have to iterate inFile in triads. First define:
def gen_triad(lines, prev=None):
    after = current = next(lines)
    for after in lines:
        yield prev, current, after
        prev, current = current, after
And then do as before:
outFile.writelines(''.join(triad) for triad in gen_triad(inFile)
                   if triad[1].startswith("03000") and "TO" in triad[1])
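As a quick illustration of what gen_triad yields (a tiny made-up input; note that the first triad's prev is None and that the last line never appears as the middle element):

lines = iter(["AAAAA\n", "00000\n", "03000|TO\n", "99999\n"])
for triad in gen_triad(lines):
    print(triad)
# (None, 'AAAAA\n', '00000\n')
# ('AAAAA\n', '00000\n', '03000|TO\n')
# ('00000\n', '03000|TO\n', '99999\n')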
import re

pat = ('^00000\|\d+\|\d+.*\n'
       '^03000\|TO\|\d+.*\n'
       '^99999\|\d+\|\d+.*\n'
       '|'
       '^AAAAA\|\d+\|\d+.*\n'
       '|'
       '^ZZZZZ\|\d+\|\d+.*')
rag = re.compile(pat, re.MULTILINE)

with open('fifi.txt', 'r') as f, \
     open('newfifi.txt', 'w') as g:
    g.write(''.join(rag.findall(f.read())))
For files with additional lines between lines beginning with 00000, 03000 and 99999, I didn't find simpler code than this one:
import re

pat = ('(^00000\|\d+\|\d+.*\n'
       '(?:.*\n)+?'
       '^99999\|\d+\|\d+.*\n)'
       '|'
       '(^AAAAA\|\d+\|\d+.*\n'
       '|'
       '^ZZZZZ\|\d+\|\d+.*)')
rag = re.compile(pat, re.MULTILINE)

pit = '^00000\|.+?^03000\|TO\|\d+.+?^99999\|'
rig = re.compile(pit, re.DOTALL | re.MULTILINE)

def yi(text):
    for g1, g2 in rag.findall(text):
        if g2:
            yield g2
        elif rig.match(g1):
            yield g1

with open('fifi.txt', 'r') as f, \
     open('newfifi.txt', 'w') as g:
    g.write(''.join(yi(f.read())))
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.readlines()
file.close()
newFile.writelines(filter(lambda x:x.startswith("03000") and "TO" in x,content))
This seems to work. The other answers seem to only write out records that contain '03000|TO|', but you have to write out the record before and after that as well.
import sys

# ---------------------------------------------------------------
# ---------------------------------------------------------------
# import file
file_name = sys.argv[1]
file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name
file = open(file_path, "r")
# ---------------------------------------------------------------
# create output files
output_file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name + '.out'
output_file = open(output_file_path, "w")
# create output files
# ---------------------------------------------------------------
# process file
temp = ''
temp_out = ''
good_write = False
bad_write = False
for line in file:
    if line[:5] == 'AAAAA':
        temp_out += line
    elif line[:5] == 'ZZZZZ':
        temp_out += line
    elif good_write:
        temp += line
        temp_out += temp
        temp = ''
        good_write = False
    elif bad_write:
        bad_write = False
        temp = ''
    elif line[:5] == '03000':
        if line[6:8] != 'TO':
            temp = ''
            bad_write = True
        else:
            good_write = True
            temp += line
            temp_out += temp
            temp = ''
    else:
        temp += line
output_file.write(temp_out)
output_file.close()
file.close()
Output:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Does it have to be Python? These shell commands would do much the same thing in a pinch:
head -1 inputfile.txt > outputfile.txt
grep -C 1 "03000|TO" inputfile.txt >> outputfile.txt
tail -1 inputfile.txt >> outputfile.txt
(One caveat: GNU grep prints a -- separator line between non-adjacent context groups, so the output may need a cleanup pass, or the --no-group-separator option if your grep supports it.)
# Whenever I have to parse text files I prefer to use regular expressions.
# You can also customize the matching criteria if you want to.
import re

what_is_being_searched = re.compile("^03000.*TO")

# don't use "file" as a variable name since it is (was?) a builtin function
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
    for this_line in source_file:
        if what_is_being_searched.match(this_line):
            destination_file.write(this_line)
and for those who prefer a more compact representation:
import re

with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
    destination_file.writelines(this_line for this_line in source_file
                                if re.match("^03000.*TO", this_line))
Code:
fileName = '1'
fil = open(fileName, 'r')

import string

## step 1: parse the file
parsedFile = []
for i in fil:
    ## tuple1 = (1,2,3)
    firstPipe = i.find('|')
    secondPipe = i.find('|', firstPipe + 1)
    tuple1 = (i[:firstPipe],
              i[firstPipe + 1:secondPipe],
              i[secondPipe + 1:i.find('\n')])
    parsedFile.append(tuple1)
fil.close()

## search criteria:
searchFirst = '03000'
searchString = 'TO'  ## can be changed if and when required

## step 2: use the parsed contents to write the new file
filout = open('newFile', 'w')
stringToWrite = parsedFile[0][0] + '|' + parsedFile[0][1] + '|' + parsedFile[0][2] + '\n'
filout.write(stringToWrite)  ## write the first entry
for i in range(1, len(parsedFile)):
    if parsedFile[i][1] == searchString and parsedFile[i][0] == searchFirst:
        for j in range(-1, 2, 1):
            stringToWrite = parsedFile[i+j][0] + '|' + parsedFile[i+j][1] + '|' + parsedFile[i+j][2] + '\n'
            filout.write(stringToWrite)
stringToWrite = parsedFile[-1][0] + '|' + parsedFile[-1][1] + '|' + parsedFile[-1][2] + '\n'
filout.write(stringToWrite)  ## write the last entry
filout.close()
I know this solution may be a bit long, but it is quite easy to understand and intuitive, and I have already checked it against the data you provided; it works perfectly.
Please tell me if you need more explanation of the code and I will definitely add it.
The tips (from Beasley and Joran Elyase) are very interesting, but they only get the contents of the 03000 line. I would like to get the contents of the lines from 00000 through 99999.
I even managed to do it, but I am not satisfied; I wanted to make it cleaner.
See how I did it:
file = open(url, 'r')
newFile = open("newFile.txt", 'w')
lines = file.readlines()
file.close()

i = 0
lineTemp = []
for line in lines:
    lineTemp.append(line)
    if line[0:5] == '03000':
        state = line[21:23]
    if line[0:5] == '99999':
        if state == 'TO':
            newFile.writelines(lineTemp)
        lineTemp = []  # reset the buffer for the next register
        i = i + 1
newFile.close()
Suggestions...
Thanks to all!
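If you want a cleaner version of that buffering idea, something along these lines might work (an untested sketch; it assumes every register runs from a 00000 line to a 99999 line, with the AAAAA/ZZZZZ header and footer lines copied straight through):

with open("file.txt", "r") as in_file, open("newFile.txt", "w") as out_file:
    register = []
    for line in in_file:
        if line.startswith(("AAAAA", "ZZZZZ")):
            out_file.write(line)  # header/footer lines pass straight through
        else:
            register.append(line)
            if line.startswith("99999"):  # end of a register: keep it or drop it
                if any(l.startswith("03000") and "TO" in l for l in register):
                    out_file.writelines(register)
                register = []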

Using Functions In Python

I needed to create a program that reads a text file and counts the number of lines, words, and characters. I got it all working below when separated individually, but I wanted to convert it to use functions so it can read the file once. However, I keep getting different answers and I'm unsure what I'm doing wrong.
Words Code
print ' '
fname = "question2.txt"
infile = open(fname, 'r')
fcontents = infile.read()
words = fcontents.split()
cwords = len(words)
print "Words: ", cwords
Characters Code
fname = "question2.txt"
infile = open ( fname, 'r' )
fcontents = infile.read()
char = len(fcontents)
print "Characters: ", char
Lines Code
fname = "question2.txt"
infile = open ( fname, 'r' )
fcontents = infile.readlines()
lines = len(fcontents)
print "Lines: ", lines
Correct Results
Words: 87
Characters: 559
Lines: 12
This is what I came up with while trying to use functions, but I just can't figure out what's wrong.
def filereader():
    fname = 'question2.txt'
    infile = open(fname, 'r')
    fcontents = infile.read()
    fcontents2 = infile.readlines()
    return fname, infile, fcontents, fcontents2

def wordcount(fcontents):
    words = fcontents.split(fcontents)
    cwords = len(words)
    return cwords

def charcount(fcontents):
    char = len(fcontents)
    return char

def linecount(fcontents2):
    lines = len(fcontents2)
    return lines

def main():
    print "Words: ", wordcount('cwords')
    print "Character: ", charcount('char')
    print "Lines: ", linecount('lines')

main()
Wrong Results
Words: 2
Character: 4
Lines: 5
You need to use filereader in main:
def main():
    fname, infile, fcontents, fcontents2 = filereader()
    print "Words: ", wordcount(fcontents)
    print "Character: ", charcount(fcontents)
    print "Lines: ", linecount(fcontents2)
Otherwise, how would you obtain the values for fcontents and fcontents2 to pass to your other functions? (Also note that wordcount calls fcontents.split(fcontents), which splits the string on itself and always yields two empty pieces; it should be fcontents.split() to split on whitespace.) You also need to fix filereader to make sure it reads the file only once:
def filereader():
    fname = 'question2.txt'
    infile = open(fname, 'r')
    fcontents = infile.read()
    fcontents2 = fcontents.splitlines(True)
    return fname, infile, fcontents, fcontents2
Note that the line for fcontents2 has been modified to split fcontents on newlines (see str.splitlines). This also gives you a list of strings, as .readlines() would.
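For illustration, splitlines(True) keeps the newline characters (the keepends argument), so the result matches what .readlines() returns:

s = "one\ntwo\nthree\n"
print s.splitlines(True)  # ['one\n', 'two\n', 'three\n'] - like readlines()
print s.splitlines()      # ['one', 'two', 'three']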
infile = open(fname, 'r')
fcontents = infile.read()
fcontents2 = infile.readlines()
You cannot read from a file twice.
When you read from a file, the file handle remembers its position in the file. Thus, after your call infile.read(), infile will be placed at the end of the file. When you then call infile.readlines(), it will try to read all the characters between its current position and the end of the file, and hence return an empty list.
You can rewind the file to its initial position using infile.seek(0). Thus:
>>> fcontents = infile.read()
>>> infile.seek(0)
>>> flines = infile.readlines()
will work.
Alternatively, having read the file into the string fcontents, you can split the string into lines using splitlines:
>>> fcontents = infile.read()
>>> flines = fcontents.splitlines()
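Putting it all together, a version that reads the file once and also fixes the wordcount bug could look like this (a sketch in the question's Python 2 style, assuming question2.txt exists):

def counts(fname):
    infile = open(fname, 'r')
    fcontents = infile.read()
    infile.close()
    words = len(fcontents.split())       # split() with no argument splits on whitespace
    chars = len(fcontents)
    lines = len(fcontents.splitlines())
    return words, chars, lines

def main():
    words, chars, lines = counts('question2.txt')
    print "Words: ", words
    print "Characters: ", chars
    print "Lines: ", lines

main()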
