How can we strip spaces before writing into csv - python

! # $ % & ( ) * , - 0 / : < = > ? # [ \ ] ^
this the header of my csv file.. after : you can see one blank space like my csv file header also contain one column with header blank.how can remove by updating following code??
feature_list = ""
root_flag = 'false'
fvt_length = 0
output_file="/home/user/Project/Dataset/unigram_FVT.csv"
feature_vector_file1 = "/home/user/Project/Dataset/BST_unigram.txt"
d = os.path.dirname(output_file)
if not os.path.exists(d):
os.makedirs(d)
with open(output_file, "w" ) as fout:
fp_feature = csv.writer(fout)
fileread=open(feature_vector_file1,"r")
read_list=fileread.read()
read_list=dataPreprocessing.remove_words_less_than_3(read_list)
read_list = read_list.replace('\n','')
read_list = re.sub( '\s+', ' ',read_list).strip()
read_list = dataPreprocessing.remove_digits(read_list)
unigram_list=list(set(read_list.split(" ")))
for i in range(0,len(unigram_list)):
unigram_list[i]=unigram_list[i].lstrip().rstrip()
if root_flag == 'false' :
root = Node(unigram_list[i])
root_flag = 'true'
else :
root.insert(unigram_list[i])
feature_list = feature_list + "\n"+unigram_list[i]
feature_list1 = feature_list.strip()
line = feature_list1.split('\n')
line.sort()
line.append("Gender")
root.print_tree()
print len(line)
fp_feature.writerow(line)
FVT_unigram()
Can anybody can help me? Sometimes my file content contains some spaces but I have added this unigram_list[i]=unigram_list[i].lstrip().rstrip() but still my header contains spaces.

I ran into a similar problem with my program the other day, and I realized the easiest thing to do is ato write a simple if statement and just create a new string/list:
aStr = "Hello World this is a test!"
newStr = ""
for letter in aStr:
if letter!=" ":
newStr += letter
And when I print the newStr:
HelloWorldthisisatest!

Related

Python string formatting creating a random new line on print statement

Here is my function. Trying to get this all to print to one line.
Here is the output ->
config::$var['pdf']['meta']['staff_member_name']
= ";"
The = ";" portion of the string prints to a new line in the console for some reason?
This is totally just a personal hack to help with a repetitious job requirement so i'm not looking for anything fancy.
Here is my function ->
def auto_pdf_config(file):
with open(file) as f:
content = f.readlines()
kill = " = array("
start = "config::$var['intake']"
new_line = ""
for line in content:
if kill not in line:
pass
elif start in line:
new_line = line
x = new_line.replace(kill, "")
y = x.replace(start,"")
pdf_end = ' = ";" '
z = "config::$var['pdf']['meta']{}{}".format(y,pdf_end)
print(z)
it seems you "y" variable has new line in it. you can try to strip it off.
y = x.replace(start,"").strip('\n')
Since x = new_line.replace(kill, ""), y = x.replace(start,""), and new_line is the line of content, it contains endline symbol (\n), that's why this endline symbol is appended before pdf_end. You just need to remove endline symbol from y.
You can do something like that:
y = y.strip('\n')

Extracting data from a text file to an output file

I have alot of files which names are just number. (Starting from 1 to whatever is the maximum number) and each of these files are similar to each other by their "tags" (ObjectID =, X =, Y =, etc.), but the values after those tags are not the same at all.
I wanted to make my job easier from manually copy/pasting the data from one file to another and made a small script using Python (since I am slightly experienced in it).
This is the full script:
import os
BASE_DIRECTORY = 'C:\Users\Tom\Desktop\TheServer\scriptfiles\Objects'
output_file = open('output.txt', 'w')
output = {}
file_list = []
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
for f in filenames:
if 'txt' in str(f):
e = os.path.join(str(dirpath), str(f))
file_list.append(e)
for f in file_list:
print f
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
if 'ObjectID =' in line:
output[f].append(line)
elif 'X =' in line:
output[f].append(line)
elif 'Y =' in line:
output[f].append(line)
tabs = []
for tab in output:
tabs.append(tab)
tabs.sort()
for tab in tabs:
for row in output[tab]:
output_file.write(row + '')
Now, everything is working fine, the output file looks like this:
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
etc.
But I don't want it to be like that (each value has a new line). I need it to do this for every file:
Read the file.
Remove the tags infront of the values.
Format a single line which will have those values in the output folder. (Let's say I want to make it look like this: "(1216,-1480.500610,2522.842285)" )
Write that line in the output folder.
Repeat for every file.
Any help please?
Hope this helps.
data = open('sam.txt', 'r').read()
>>> print data
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
>>>
Now lets do some string replacements :)
>>> data = data.replace('ObjectID =', '').replace('\nX = ', ',').replace('\nY = ', ',')
>>> print data
1216,-1480.500610,2610.885742
970,-1517.210693,2522.842285
3802,-1512.156616,2521.116210
In your loop, keep track of whether you are 'in' a record:
records = []
in_record = False
id, x, y = 0, 0, 0
for line in txtfile:
if not in_record:
if 'ObjectID =' in line:
in_record = True
id = line[10:]
elif 'X =' in line:
x = line[3:]
elif 'Y =' in line:
y = line[3:]
records.append((id, x, y))
in_record = False
Then you'll have a list of tuples which you can easily write with the csv module.
Find here a version of the loop you have generating the contents.
I rewrote it so the line contents ObjectId, X and Y are in the same line.
It looks that is what you want to do:
for f in file_list:
print f
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
myline = ''
if 'ObjectID =' in line:
pos = line.rfind("ObjectID =") + len("ObjectID =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
elif 'X =' in line:
pos = line.rfind("X =") + len("X =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
elif 'Y =' in line:
pos = line.rfind("Y =") + len("Y =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
output[f].append(myline)
Note that you need to know which character (in the code the delimiter) separates the names you try to find: ObjectID = from the actual values you want to grab from the line.
Here is what you need. I did not have enough time to write the code for appending the result to a new file. Instead it just prints it, but you get the point.
import os.path
path = "path"
#getting the number of files in your folder
num_files = len([f for f in os.listdir(path)
if os.path.isfile(os.path.join(path, f))])
#function that returns your desired output for a given file
def file_head_ext(file_path, file_num):
with open(file_path + "/" + file_num) as myfile:
head = [next(myfile).split("=") for x in range(3)]
formatted_head = [elm[1].replace("\n",'').replace(" ","") for elm in head]
return(",".join(formatted_head))
for filnum in range(1,num_files):
print(file_head_ext(path, str(filnum)))

Python - searching if string is in file

I want to search for string in file and if there is string make action and if there isn´t string make other action, but from this code:
itcontains = self.textCtrl2.GetValue()
self.textCtrl.AppendText("\nTY: " + itcontains)
self.textCtrl2.Clear()
pztxtflpath = "TCM/Zoznam.txt"
linenr = 0
with open(pztxtflpath) as f:
found = False
for line in f:
if re.search("\b{0}\b".format(itcontains),line):
hisanswpath = "TCM/" + itcontains + ".txt"
hisansfl = codecs.open(hisanswpath, "r")
textline = hisansfl.readline()
linenr = 0
ans = ""
while textline <> "":
linenr += 1
textline = hisansfl.readline()
hisansfl.close()
rnd = random.randint(1, linenr) - 1
hisansfl = codecs.open(pztxtflpath, "r")
textline = hisansfl.readline()
linenr = 0
pzd = ""
while linenr <> rnd:
textline = hisansfl.readline()
linenr += 1
ans = textline
hisansfl.close()
self.textCtrl.AppendText("\nTexter: " + ans)
if not found:
self.textCtrl.AppendText("\nTexter: " + itcontains)
wrtnw = codecs.open(pztxtflpath, "a")
wrtnw.write("\n" + itcontains)
wrtnw.close
If there is not that string it is working corectly, but if there is that string, what i am searching for it makes if not found action. I really don´t know how to fix it, i have already try some codes from other sites, but in my code it doesn´t works. Can somebody help please?
Are you saying that the code underneath the following if statement executes if the string contains what you're looking for?
if re.search("\b{0}\b".format(itcontains),line):
If so, then you just need to add the following to the code block underneath this statement:
found = True
This will keep your if not found clause from running. If the string you are looking for should only be found once, I would also add a break statement to your first statement to break out of the loop.

Pick parts from a txt file and copy to another file with python

I'm in trouble here. I need to read a file. Txt file that contains a sequence of records, check the records that I want to copy them to a new file.
The file content is like this (this is just an example, the original file has more than 30 000 lines):
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|316 #begin register
03000|SP|467
99999|33|130 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
The records that begin with 03000 and have the characters 'TO' must be written to a new file. Based on the example, the file should look like this:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Code:
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.read()
file.close()
# here I need to check if the record exists 03000 characters 'TO', if it exists, copy the recordset 00000-99999 for the new file.
I did multiple searches and found nothing to help me.
Thank you!
with open("file.txt",'r') as inFile, open("newFile.txt","w") as outFile:
outFile.writelines(line for line in inFile
if line.startswith("03000") and "TO" in line)
If you need the previous and the next line, then you have to iterate inFile in triads. First define:
def gen_triad(lines, prev=None):
after = current = next(lines)
for after in lines:
yield prev, current, after
prev, current = current, after
And then do like before:
outFile.writelines(''.join(triad) for triad in gen_triad(inFile)
if triad[1].startswith("03000") and "TO" in triad[1])
import re
pat = ('^00000\|\d+\|\d+.*\n'
'^03000\|TO\|\d+.*\n'
'^99999\|\d+\|\d+.*\n'
'|'
'^AAAAA\|\d+\|\d+.*\n'
'|'
'^ZZZZZ\|\d+\|\d+.*')
rag = re.compile(pat,re.MULTILINE)
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
g.write(''.join(rag.findall(f.read())))
For files with additional lines between lines beginning with 00000, 03000 and 99999, I didn't find simpler code than this one:
import re
pat = ('(^00000\|\d+\|\d+.*\n'
'(?:.*\n)+?'
'^99999\|\d+\|\d+.*\n)'
'|'
'(^AAAAA\|\d+\|\d+.*\n'
'|'
'^ZZZZZ\|\d+\|\d+.*)')
rag = re.compile(pat,re.MULTILINE)
pit = ('^00000\|.+?^03000\|TO\|\d+.+?^99999\|')
rig = re.compile(pit,re.DOTALL|re.MULTILINE)
def yi(text):
for g1,g2 in rag.findall(text):
if g2:
yield g2
elif rig.match(g1):
yield g1
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
g.write(''.join(yi(f.read())))
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.readlines()
file.close()
newFile.writelines(filter(lambda x:x.startswith("03000") and "TO" in x,content))
This seems to work. The other answers seem to only be writing out records that contain '03000|TO|' but you have to write out the record before and after that as well.
import sys
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# import file
file_name = sys.argv[1]
file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name
file = open(file_path,"r")
# ---------------------------------------------------------------
# create output files
output_file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name + '.out'
output_file = open(output_file_path,"w")
# create output files
# ---------------------------------------------------------------
# process file
temp = ''
temp_out = ''
good_write = False
bad_write = False
for line in file:
if line[:5] == 'AAAAA':
temp_out += line
elif line[:5] == 'ZZZZZ':
temp_out += line
elif good_write:
temp += line
temp_out += temp
temp = ''
good_write = False
elif bad_write:
bad_write = False
temp = ''
elif line[:5] == '03000':
if line[6:8] != 'TO':
temp = ''
bad_write = True
else:
good_write = True
temp += line
temp_out += temp
temp = ''
else:
temp += line
output_file.write(temp_out)
output_file.close()
file.close()
Output:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Does it have to be python? These shell commands would do the same thing in a pinch.
head -1 inputfile.txt > outputfile.txt
grep -C 1 "03000|TO" inputfile.txt >> outputfile.txt
tail -1 inputfile.txt >> outputfile.txt
# Whenever I have to parse text files I prefer to use regular expressions
# You can also customize the matching criteria if you want to
import re
what_is_being_searched = re.compile("^03000.*TO")
# don't use "file" as a variable name since it is (was?) a builtin
# function
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
for this_line in source_file:
if what_is_being_searched.match(this_line):
destination_file.write(this_line)
and for those who prefer a more compact representation:
import re
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
destination_file.writelines(this_line for this_line in source_file
if re.match("^03000.*TO", this_line))
code:
fileName = '1'
fil = open(fileName,'r')
import string
##step 1: parse the file.
parsedFile = []
for i in fil:
##tuple1 = (1,2,3)
firstPipe = i.find('|')
secondPipe = i.find('|',firstPipe+1)
tuple1 = (i[:firstPipe],\
i[firstPipe+1:secondPipe],\
i[secondPipe+1:i.find('\n')])
parsedFile.append(tuple1)
fil.close()
##search criterias:
searchFirst = '03000'
searchString = 'TO' ##can be changed if and when required
##step 2: used the parsed contents to write the new file
filout = open('newFile','w')
stringToWrite = parsedFile[0][0] + '|' + parsedFile[0][1] + '|' + parsedFile[0][2] + '\n'
filout.write(stringToWrite) ##to write the first entry
for i in range(1,len(parsedFile)):
if parsedFile[i][1] == searchString and parsedFile[i][0] == searchFirst:
for j in range(-1,2,1):
stringToWrite = parsedFile[i+j][0] + '|' + parsedFile[i+j][1] + '|' + parsedFile[i+j][2] + '\n'
filout.write(stringToWrite)
stringToWrite = parsedFile[-1][0] + '|' + parsedFile[-1][1] + '|' + parsedFile[-1][2] + '\n'
filout.write(stringToWrite) ##to write the first entry
filout.close()
I know that this solution may be a bit long. But it is quite easy to understand. And it seems an intuitive way to do it. And I have already checked this with the Data that you have provided and it works perfectly.
Please tell me if you need some more explanation on the code. I will definitely add the same.
I tip (Beasley and Joran elyase) very interesting, but it only allows to get the contents of the line 03000. I would like to get the contents of the lines 00000 to line 99999.
I even managed to do here, but I am not satisfied, I wanted to make a more cleaner.
See how I did:
file = open(url,'r')
newFile = open("newFile.txt",'w')
lines = file.readlines()
file.close()
i = 0
lineTemp = []
for line in lines:
lineTemp.append(line)
if line[0:5] == '03000':
state = line[21:23]
if line[0:5] == '99999':
if state == 'TO':
newFile.writelines(lineTemp)
else:
linhaTemp = []
i = i+1
newFile.close()
Suggestions...
Thanks to all!

Python CSV module, add column to the side, not the bottom

I am new in python, and I need some help. I made a python script that takes two columns from a file and copies them into a "new file". However, every now and then I need to add columns to the "new file". I need to add the columns on the side, not the bottom. My script adds them to the bottom. Someone suggested using CSV, and I read about it, but I can't make it in a way that it adds the new column to the side of the previous columns. Any help is highly appreciated.
Here is the code that I wrote:
import sys
import re
filetoread = sys.argv[1]
filetowrite = sys.argv[2]
newfile = str(filetowrite) + ".txt"
openold = open(filetoread,"r")
opennew = open(newfile,"a")
rline = openold.readlines()
number = int(len(rline))
start = 0
for i in range (len(rline)) :
if "2theta" in rline[i] :
start = i
for line in rline[start + 1 : number] :
words = line.split()
word1 = words[1]
word2 = words[2]
opennew.write (word1 + " " + word2 + "\n")
openold.close()
opennew.close()
Here is the second code I wrote, using CSV:
import sys
import re
import csv
filetoread = sys.argv[1]
filetowrite = sys.argv[2]
newfile = str(filetowrite) + ".txt"
openold = open(filetoread,"r")
rline = openold.readlines()
number = int(len(rline))
start = 0
for i in range (len(rline)) :
if "2theta" in rline[i] :
start = i
words1 = []
words2 = []
for line in rline[start + 1 : number] :
words = line.split()
word1 = words[1]
word2 = words[2]
words1.append([word1])
words2.append([word2])
with open(newfile, 'wb') as file:
writer = csv.writer(file, delimiter= "\n")
writer.writerow(words1)
writer.writerow(words2)
These are some samples of input files:
https://dl.dropbox.com/u/63216126/file5.txt
https://dl.dropbox.com/u/63216126/file6.txt
My first script works "almost" great, except that it writes the new columns at the bottom and I need them at side of the previous columns.
The proper way to use writerow is to give it a single list that contains the data for all the columns.
words.append(word1)
words.append(word2)
writer.writerow(words)

Categories