Writing processed data into Excel using CSV in Python

I'm trying to write some data into an Excel spreadsheet using the csv module.
I'm writing a motif finder that reads its input from a FASTA file and writes the results to Excel.
But I'm having a hard time writing the data in the correct format.
My desired result in Excel looks like this:
SeqName  M1       Hits  M2           Hits
Seq1     MN[A-Z]  3     V[A-Z]R[ML]  2
Seq2     MN[A-Z]  0     V[A-Z]R[ML]  5
Seq3     MN[A-Z]  1     V[A-Z]R[ML]  0
I have generated the correct results, but I just don't know how to arrange them in the format shown above.
This is the code I have so far:
import re
from Bio import SeqIO
import csv
import collections

def SearchMotif(f1, motif, f2="motifs.xls"):
    with open(f1, 'r') as fin, open(f2, 'wb') as fout:
        # This makes SeqName static and everything else mutable so that, when more
        # than one motif is searched, they can be correctly placed into Excel.
        writer = csv.writer(fout, delimiter='\t')
        motif_fieldnames = ['SeqName']
        writer_dict = csv.DictWriter(fout, delimiter='\t', fieldnames=motif_fieldnames)
        for i in range(len(motif)):
            motif_fieldnames.append('M%d' % (i+1))
            motif_fieldnames.append('Hits')
        writer_dict.writeheader()
        # Reading the input FASTA file for processing.
        fasta_name = []
        for seq_record in SeqIO.parse(f1, 'fasta'):
            sequence = repr(seq_record.seq)  # the re module only takes strings
            fasta_name.append(seq_record.name)
            print sequence  # **********
            for j in motif:
                motif_name = j
                print motif_name  # **********
                number_count = len(re.findall(j, sequence))
                print number_count  # **********
                writer.writerow([motif_name])
        for i in fasta_name:
            writer.writerow([i])  # [] makes it fit into one column instead of one character per column
The print statements marked with asterisks (**********) generate the output below, where each number is the number of hits and the different sequences are Seq1, Seq2, and so on:
Seq('QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQ...LTS', SingleLetterAlphabet())
PA[A-Z]
0
Y[A-Z]L[A-Z]
0
Seq('SFNVATLPAESSSTDLDTTVLLPDEPAEVSDLERIETEWTNMKILELPFAPQMK...VSS', SingleLetterAlphabet())
PA[A-Z]
2
Y[A-Z]L[A-Z]
0
Seq('PAESIYFKIEKTYNLT', SingleLetterAlphabet())
PA[A-Z]
1
Y[A-Z]L[A-Z]
1

You can write your data to a Pandas DataFrame, and then use the DataFrame's to_csv method to export it to a CSV. There is also a to_excel method. Pandas won't let you have multiple columns with the same name, like your "Hits" column. However, you can work around that by putting the column names you want in the first row and using the header=False option when you export.
"import pandas as pd", then replace your code starting at "fasta_name = []" with this:
column_names = ['SeqName']
for i, m in enumerate(motif):
    column_names += ['M'+str(i), 'Hits'+str(i)]

df = pd.DataFrame(columns=column_names)
for row, seq_record in enumerate(SeqIO.parse(f1, 'fasta')):
    sequence = str(seq_record.seq)  # search the sequence itself, not its repr
    df.loc[row, 'SeqName'] = seq_record.name
    for i, j in enumerate(motif):
        df.loc[row, 'M'+str(i)] = j
        df.loc[row, 'Hits'+str(i)] = len(re.findall(j, sequence))
df.to_csv(f2, sep='\t', index=False)
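For completeness, here is a minimal sketch of the header=False workaround mentioned above, so every hits column can share the visible name "Hits" in the output (display_names is a name introduced here for illustration):

display_names = ['SeqName']
for i, m in enumerate(motif):
    display_names += ['M%d' % (i+1), 'Hits']      # repeated "Hits" labels are fine in the file itself
with open(f2, 'w') as fout:
    fout.write('\t'.join(display_names) + '\n')   # write the desired header row manually
    df.to_csv(fout, sep='\t', index=False, header=False)  # then export the data without pandas' header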

Related

Is there a function to concatenate two header rows into one?

Consider the following text file excerpt:
Distance,Velocity,Time
(m),(m/s),(s)
1,1,1
2,1,2
3,1,3
I want it to be transformed into this:
Distance(m),Velocity(m/s),Time(s)
1,1,1
2,1,2
3,1,3
In other words, I want to concatenate the rows that contain text, and I want them to be concatenated column-wise.
I am initially manipulating a text file generated by a piece of software. I have successfully reduced it to only the numeric columns and their headers, in CSV format. But I have multiple header rows for each column, and I need all the information in each header row, because the column attributes differ from file to file. How can I do this in a smart way in Python?
Edit: Thank you for your suggestions, they helped me a lot. I used Daweo's solution and added a dynamic row count, because the number of header rows may differ from 2 to 7 depending on the generated output. Here's the code snippet I ended up with:
# Get column headers
a = 0
header_rows = 0
numlines = 0
with open(full, "r") as input:
    Lines = ""
    for line in input:
        l = line
        g = re.sub(' +', ' ', l)
        y = re.sub('\t', ',', g)
        numlines += 1
        if len(l.encode('ANSI')) > 250:
            # finds header start row
            a += 1
        if a > 0:
            # finds header end row
            if "---" in line:
                header_rows = numlines - (numlines - a + 1)
                break
            else:
                # Lines is my headers string
                Lines = Lines + "%s" % (y) + ' '
# Create concatenated column headers
rows = [i.split(',') for i in Lines.rstrip().split('\n')]
cols = [list(c) for c in zip(*rows)]
for i in cols:
    for j in rows:
        newcolz = [list(c) for c in zip(*rows)]
        print(newcolz)
I would do it the following way:

txt = " Distance,Velocity,Time \n (m),(m/s),(s) \n 1,1,1 \n 2,1,2 \n 3,1,3 \n "
rows = [i.split(',') for i in txt.rstrip().split('\n')]
cols = [list(c) for c in zip(*rows)]
newcols = [[i[0]+i[1], *i[2:]] for i in cols]
newrows = [','.join(i) for i in zip(*newcols)]
newtxt = '\n'.join(newrows)
print(newtxt)
Output:
Distance (m),Velocity(m/s),Time (s)
1,1,1
2,1,2
3,1,3
Crucial here is the use of zip to transpose the data, so I can deal with columns rather than rows. [[i[0]+i[1],*i[2:]] for i in cols] is responsible for the actual concatenation, so if you had headers spanning 3 lines you could do [[i[0]+i[1]+i[2],*i[3:]] for i in cols], and so on. A generalization to any number of header rows is sketched below.
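A hedged generalization of that comprehension, assuming n_header header rows (n_header is a name introduced here, not from the answer), reusing txt from above:

n_header = 2  # number of header rows to merge (assumed known)
rows = [i.split(',') for i in txt.rstrip().split('\n')]
cols = [list(c) for c in zip(*rows)]
# join the first n_header entries of each column, keep the rest unchanged
newcols = [[''.join(i[:n_header]), *i[n_header:]] for i in cols]
newtxt = '\n'.join(','.join(i) for i in zip(*newcols))
print(newtxt)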
I am not aware of anything that exists to do this, so instead you can just write a custom function. In the example below the function takes two strings and also a separator, which defaults to ",".
It splits each string into a list, then uses a list comprehension with zip to pair up the lists and join each pair.
Lastly it joins the consolidated headers again with the separator.
def concat_headers(header1, header2, separator=","):
    headers1 = header1.split(separator)
    headers2 = header2.split(separator)
    consolidated_headers = ["".join(values) for values in zip(headers1, headers2)]
    return separator.join(consolidated_headers)

data = """Distance,Velocity,Time\n(m),(m/s),(s)\n1,1,1\n2,1,2\n3,1,3\n"""
header1, header2, *lines = data.splitlines()
consolidated_headers = concat_headers(header1, header2)
print(consolidated_headers)
print("\n".join(lines))
Output:
Distance(m),Velocity(m/s),Time(s)
1,1,1
2,1,2
3,1,3
You don't really need a function to do it because it can be done like this using the csv module:
import csv

data_filename = 'position_data.csv'
new_filename = 'new_position_data.csv'
with open(data_filename, 'r', newline='') as inp, \
     open(new_filename, 'w', newline='') as outp:
    reader, writer = csv.reader(inp), csv.writer(outp)
    row1, row2 = next(reader), next(reader)
    new_header = [a+b for a, b in zip(row1, row2)]
    writer.writerow(new_header)
    # Copy the rest of the input file.
    for row in reader:
        writer.writerow(row)
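If the number of header rows varies from file to file (as in the edit above), the same pattern extends naturally; a minimal sketch, assuming the header row count n_header has already been detected:

import csv
from itertools import islice

n_header = 2  # assumed detected beforehand, e.g. by the edit's row-counting logic
with open('position_data.csv', newline='') as inp, \
     open('new_position_data.csv', 'w', newline='') as outp:
    reader, writer = csv.reader(inp), csv.writer(outp)
    header_rows = list(islice(reader, n_header))                      # grab all header rows
    writer.writerow([''.join(parts) for parts in zip(*header_rows)])  # merge them column-wise
    writer.writerows(reader)                                          # copy the data rows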

Reading CSV file from stdin in Python and modifying it

I need to read a CSV file from stdin and output only the rows whose values equal those specified for given columns. My input is like this:
2
Kashiwa
Name,Campus,LabName
Shinichi MORISHITA,Kashiwa,Laboratory of Omics
Kenta Naai,Shirogane,Laboratory of Functional Analysis in Silico
Kiyoshi ASAI,Kashiwa,Laboratory of Genome Informatics
Yukihide Tomari,Yayoi,Laboratory of RNA Function
My output should be like this:
Name,Campus,LabName
Shinichi MORISHITA,Kashiwa,Laboratory of Omics
Kiyoshi ASAI,Kashiwa,Laboratory of Genome Informatics
I need to select the people whose value in column #2 equals Kashiwa, and not echo the first 2 lines of stdin to stdout.
So far I have just tried to read stdin into csv, but I am getting each row as a list of strings (as expected from the csv documentation). Can I change this?
#!/usr/bin/env python3
import sys
import csv

data = sys.stdin.readlines()
for line in csv.reader(data):
    print(line)
Output:
['2']
['Kashiwa']
['Name', 'Campus', 'LabName']
['Shinichi MORISHITA', 'Kashiwa', 'Laboratory of Omics']
['Kenta Naai', 'Shirogane', 'Laboratory of Functional Analysis in Silico']
['Kiyoshi ASAI', 'Kashiwa', 'Laboratory of Genome Informatics']
['Yukihide Tomari', 'Yayoi', 'Laboratory of RNA Function']
Can someone give me some advice on reading stdin into csv and manipulating the data afterwards (outputting only the needed columns, swapping columns, etc.)?
#!/usr/bin/env python3
import sys
import csv

data = sys.stdin.readlines()               # read the whole input
column_to_be_matched = int(data.pop(0))    # the column number to match
word_to_be_matched = data.pop(0).strip()   # the word to match in that column (strip the newline)
col_headers = data.pop(0).strip()          # the header line
print(col_headers)                         # print the column names
for line in csv.reader(data):
    if line[column_to_be_matched - 1] == word_to_be_matched:  # when it matches
        print(", ".join(line))             # print the row
Use Pandas to read and manage your data in a DataFrame:
import pandas as pd

# File location
infile = r'path/file'

# Load the file and skip the first two rows
df = pd.read_csv(infile, skiprows=2)

# Keep only the rows whose Campus column equals Kashiwa
df = df[df['Campus'] == 'Kashiwa']
You can perform all kinds of edits; for example, sort your DataFrame simply by:

df.sort_values(by='your column')
Check the Pandas documentation for all the possibilities.
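Since the question reads from stdin, the same approach also works by passing sys.stdin straight to read_csv; a minimal sketch (the Campus column name is taken from the sample data):

import sys
import pandas as pd

df = pd.read_csv(sys.stdin, skiprows=2)   # skip the "2" and "Kashiwa" lines
df = df[df['Campus'] == 'Kashiwa']        # keep only the Kashiwa rows
df.to_csv(sys.stdout, index=False)        # write the filtered CSV to stdout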
This is one approach.
Ex:
import csv

with open(filename) as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # Skip First Line
    next(reader)  # Skip Second Line
    print(next(reader))  # Print Header
    for row in reader:
        if row[1] == 'Kashiwa':  # Filter By 'Kashiwa'
            print(row)
Output:
['Name', 'Campus', 'LabName']
['Shinichi MORISHITA', 'Kashiwa', 'Laboratory of Omics']
['Kiyoshi ASAI', 'Kashiwa', 'Laboratory of Genome Informatics']
import csv, sys

data = sys.stdin.readlines()            # read everything from stdin
out = []
data_lines = list(csv.reader(data))
for line in data_lines[2:]:             # skip the first two non-CSV lines; adjust the slice to match your input
    if line[1] == 'Kashiwa':
        new = [line[0], line[1], line[2]]          # you can use a string instead of a list
        string = f"{line[0]},{line[1]},{line[2]}"
        sys.stdout.write(string + '\n')            # print(string) would do the same as this stdout write
        out.append(new)
sys.stdout.write(str(out))              # outputs the collected rows as a list repr; print(out) would do the same

Python: Extracting specific rows from csv as list

Probably a repost, but I can't find a solution that works in my case.
I have a .csv file with an ID number associated with a string, like this:
0 | ABC
1 | DEF
...
100 | XYZ
I would like to extract the row with ID number x and append it to a list, so ideally something like:
with open('myfile.csv', 'rb') as f:
    reader = csv.reader(f)
    results.append([row for idx, row in enumerate(reader) if idx[0] == x])
This solution does not appear to work: it tells me that the "iterator should return strings, not bytes", despite the fact that I thought I opened the file in byte mode.
Use pandas to read the CSV file into a DataFrame:
import pandas as pd
sourceinput = pd.read_csv('myfile.csv')
outputlist = sourceinput['id'].loc[sourceinput['id'] == <value_you_need>].tolist()
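For reference, the original csv.reader approach also works in Python 3 once the file is opened in text mode instead of 'rb' (which is what triggers the bytes error); a minimal sketch, assuming the ID sits in the first column:

import csv

x = 42        # the ID you are looking for (placeholder value)
results = []
with open('myfile.csv', newline='') as f:   # text mode, as csv expects in Python 3
    reader = csv.reader(f)                  # adjust delimiter= if the file is not comma-separated
    results.extend(row for row in reader if row[0] == str(x))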

Detect string on CSV data array and add 2 cells before

I'm so confused with this CSV data treatment; any help would be nice.
I have a CSV file with multiple columns, like this:
col_1;col_2;col_3;col_4;col_5;col_6;col_7;col_8;
Object1;123;Something;456;Something2;0;0;someword;789
Object2;123;Something;456;Something2;0;0;someword;789
Object3;123;Something;456;Something2;0;0;someword;789
Object4;123;Something;456;Something2;0;0;someword;789
But some objects have missing data in col_6, col_7, and col_8; instead there's a Keyword in col_6, like this:
col_1;col_2;col_3;col_4;col_5;col_6;col_7;col_8;
Object1;123;Something;456;Something2;0;0;someword;789
Object2;123;Something;456;Something2;0;0;someword;789
Object3;123;Something;456;Something2;Keyword;789;Object4;123
Something;456;Something2;0;0
I've detected how many lines have those Keywords, and the number of each row:
import csv

class FixIt:
    def test(self):
        count = 0
        with open('input.csv', mode='r') as file:
            reader = csv.reader(file)
            for num, row in enumerate(reader):
                count += 1
                if 'Keyword' in row[0]:
                    print(num, row)
                    count += 1
        print(count)

TryIt = FixIt()
TryIt.test()
I need to put two zeros (or some string values) in the cells before the Keyword to restore the output to the original structure, like:
col_1;col_2;col_3;col_4;col_5;col_6;col_7;col_8;col_9
Object1;123;Something;456;Something2;0;0;someword;789
Object2;123;Something;456;Something2;0;0;someword;789
Object3;123;Something;456;Something2;corrective_data;corrective_data;Keyword;789
Object4;123;Something;456;Something2;0;0;someword;789
Maybe this can be done with pandas, but I don't know where or how to start; some orientation or an answer would be kindly appreciated.
Try 1:
I've tried to replace the string Keyword on each line with 0;0;Keyword:
with open("input.csv", "r") as file_input:
with open("output.csv", "w") as file_output:
for line in file_input:
file_output.write(line.replace('Keyword','0;0;Keyword'))
But the result is wrong: it adds a ";" inside every cell and also writes the string ";0;0;Keyword". After inspecting the file with vim I saw that I will also need to add a new row after the 789 (because I see a " " as a line break).
I'm so lost right now; maybe creating one object with a list of properties for every row would be better(?).
Not sure if this is what you want, because the data in your second code cell is not correctly formatted. I assume you want to make the following changes:
col_1;col_2;col_3;col_4;col_5;col_6;col_7;col_8;col_9
Object3;123;Something;456;Something2;0;0;someword;789
Object4;123;Something;456;Something2;Keyword;231
# TO #
col_1;col_2;col_3;col_4;col_5;col_6;col_7;col_8;col_9
Object3;123;Something;456;Something2;0;0;someword;789
Object3;123;Something;456;Something2;0;0;Keyword;231
So here is how you can make the changes with pandas:

import pandas as pd

# read the input data from the csv file
data = pd.read_csv("input.csv", delimiter=';')

# get the indices of rows with "Keyword" appearing in col_6
idxs = data.loc[data['col_6'] == "Keyword"].index

# copy the value in col_6 to col_8, and the value in col_7 to col_9
data.loc[idxs, 'col_8'] = data.loc[idxs, 'col_6']
data.loc[idxs, 'col_9'] = data.loc[idxs, 'col_7']
data.loc[idxs, 'col_6'] = 0  # fill col_6 with 0
data.loc[idxs, 'col_7'] = 0  # fill col_7 with 0

# write the result to a new file
data.to_csv("result.csv", sep=';', index=False)

Python: Pandas, dealing with spaced column names

If I have multiple text files that I need to parse that look like so (a block of comment lines topped with asterisks and pound signs, followed by whitespace-separated columns), but can vary in terms of column names and the length of the hashtag block at the top:
How would I go about turning this into a pandas DataFrame? I've tried using pd.read_table('file.txt', delim_whitespace=True, skiprows=14), but it has all sorts of problems. My issues are...
All the text, asterisks, and pound signs at the top need to be ignored, but I can't just use skiprows because the amount of junk at the top varies in length from file to file.
The columns "stat (+/-)" and "syst (+/-)" are seen as 4 columns because of the whitespace.
The pound sign is included in the column names, and I don't want that. I can't just assign the column names manually because they vary from text file to text file.
Any help is much obliged; I'm just not really sure where to go after reading the file with pandas.
Consider reading in the raw file and cleaning it line by line while writing to a new file with the csv module. A regex is used to identify the column header row, using the i column as the match criterion. The code below assumes more than one space separates columns:
import os
import csv, re
import pandas as pd

rawfile = "path/To/RawText.txt"
tempfile = "path/To/TempText.txt"

with open(tempfile, 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    with open(rawfile, 'r') as data_file:
        for line in data_file:
            if re.match('^.*i', line):            # KEEP COLUMN HEADER ROW
                line = line.replace('\n', '')
                row = line.split("  ")
                writer.writerow(row)
            elif line.startswith('#') == False:   # REMOVE HASHTAG LINES
                line = line.replace('\n', '')
                row = line.split("  ")
                writer.writerow(row)

df = pd.read_csv(tempfile)                               # IMPORT TEMP FILE
df.columns = [c.replace('# ', '') for c in df.columns]   # REMOVE '#' IN COL NAMES
os.remove(tempfile)                                      # DELETE TEMP FILE
This is the way I mentioned in the comment: it uses a file object to skip the custom dirty data at the beginning. You land the file offset at the appropriate location in the file, where read_fwf simply does the job:
with open(rawfile, 'r') as data_file:
    while data_file.read(1) == '#':
        last_pound_pos = data_file.tell()
        data_file.readline()
    data_file.seek(last_pound_pos)
    df = pd.read_fwf(data_file)

df
Out[88]:
i mult stat (+/-) syst (+/-) Q2 x x.1 Php
0 0 0.322541 0.018731 0.026681 1.250269 0.037525 0.148981 0.104192
1 1 0.667686 0.023593 0.033163 1.250269 0.037525 0.150414 0.211203
2 2 0.766044 0.022712 0.037836 1.250269 0.037525 0.149641 0.316589
3 3 0.668402 0.024219 0.031938 1.250269 0.037525 0.148027 0.415451
4 4 0.423496 0.020548 0.018001 1.250269 0.037525 0.154227 0.557743
5 5 0.237175 0.023561 0.007481 1.250269 0.037525 0.159904 0.750544
