Reading a file into a Python dictionary and then writing it to a file

I have a multi-row, multi-column text file. It has a header, if that's important. I want to read all the data into a Python dictionary and then just write it out again to a file. Ultimately I want to create two dictionaries from two files, join them on the key, and then print out the joined version, but I can't even get this part right. This is what I have:
import sys
import csv
from collections import defaultdict

usage = "usage: python Newer.py <project_file> <table_file> <outfile>"

if len(sys.argv) != 4:
    print usage
    sys.exit(0)

project = open(sys.argv[1], "rb")
table = open(sys.argv[2], "rb")
outfile = open(sys.argv[3], "w")

projectdict = defaultdict(list)

for line in project:
    parts = line.strip().split("\t")
    first = parts[1]
    projectdict[first].append(line)

for key in projectdict:
    outfile.write(" ".join(projectdict[first]) + "\n")
What I get from it is a text file with the same entry repeated over and over again.

The issue is that you're writing the value at key "first" every time in the second loop, regardless of what the actual key is. I believe this will fix your problem.
for key in projectdict:
    outfile.write(" ".join(projectdict[key]) + "\n")
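Since the end goal is to build two dictionaries from two files and join them on the key, here is a minimal sketch of that step, assuming both files are tab-separated and keyed on the same column (the file names and the key column index are placeholders):
from collections import defaultdict

def load_dict(path, key_col=1):
    # map the key column to the list of lines that share it
    d = defaultdict(list)
    with open(path) as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            d[parts[key_col]].append(line.rstrip("\n"))
    return d

projectdict = load_dict("project.txt")  # placeholder file names
tabledict = load_dict("table.txt")

with open("joined.txt", "w") as out:
    # inner join: keep only keys present in both files
    for key in set(projectdict) & set(tabledict):
        out.write(" ".join(projectdict[key] + tabledict[key]) + "\n")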

Merging 2 json files

I'm trying to merge both JSON files, appending the timestamp from file2 to the corresponding frame number in file1. Please guide.
JSON_FILE1
{"frameNumber":1,"classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":true,"bbox":{"top":157,"left":581,"height":390,"width":297},"classifications":[]}]}
{"frameNumber":2,"classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":false,"bbox":{"top":157,"left":581,"height":390.36,"width":297.16},"classifications":[]}]}
{"frameNumber":3,"classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":false,"bbox":{"top":157,"left":581,"height":390.72,"width":297.32},"classifications":[]}]}
{"frameNumber":4,"classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":false,"bbox":{"top":157,"left":581,"height":391.08,"width":297.48},"classifications":[]}]}
{"frameNumber":5,"classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":false,"bbox":{"top":157,"left":581,"height":391.44,"width":297.64},"classifications":[]}]}
JSON_FILE2
{
"frame1": "0:0:0:66",
"frame2": "0:0:0:100",
"frame3": "0:0:0:133",
"frame4": "0:0:0:166",
"frame5": "0:0:0:200"
}
expected output:
{"frameNumber":1,"frame1": "0:0:0:66",,"classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":true,"bbox":{"top":157,"left":581,"height":390,"width":297},"classifications":[]}]}
{"frameNumber":2, "frame2": "0:0:0:10,"classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":false,"bbox":{"top":157,"left":581,"height":390.36,"width":297.16},"classifications":[]}]}
{"frameNumber":3,"frame3": "0:0:0:133,"classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":false,"bbox":{"top":157,"left":581,"height":390.72,"width":297.32},"classifications":[]}]}
{"frameNumber":4,"frame4": "0:0:0:166","classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":false,"bbox":{"top":157,"left":581,"height":391.08,"width":297.48},"classifications":[]}]}
{"frameNumber":5,"frame5": "0:0:0:200","classifications":[],"objects":[{"featureId":"ckotybs4v00033b68edh8a6o5","schemaId":"ckoto8fzm16gj0y7uesrd0nzt","title":"Person 1","value":"person_1","color":"#1CE6FF","keyframe":false,"bbox":{"top":157,"left":581,"height":391.44,"width":297.64},"classification
I tried the following, but I was unable to achieve this:
import json
import glob

result = []
for f in glob.glob("*.json"):
    with open(f, "rb") as infile:
        result.append(json.load(infile))

with open("merged_file.json", "wb") as outfile:
    json.dump(result, outfile)
A correct .json file needs a surrounding pair of [] before you can json.load it as a whole and then iterate over every entry in the same way as below. But anyway: the easiest solution is to turn every line into a dict, add the timestamp if the frame number matches, and write it back.
import json

def fuse(file1, file2, nTargetPath):
    with open(nTargetPath, "w") as tTargetFile:
        with open(file1, "r") as tSourceFileA:
            for tLineA in tSourceFileA.readlines():
                tDictA = json.loads(tLineA)  # loads a dict from one line
                # build the key to search for, e.g. "frame1"
                # (but why not name this timestampX in file2?)
                tKey = "frame" + str(tDictA["frameNumber"])
                with open(file2, "r") as tSourceFileB:
                    # file2 is a single JSON object, so load it whole
                    tDictB = json.load(tSourceFileB)
                    if tKey in tDictB:
                        tDictA[tKey] = tDictB[tKey]  # there is only one timestamp per frame
                tTargetFile.write(json.dumps(tDictA) + '\n')
This code can easily be improved, for example by optimising the file access when you know the timestamp key in file2 is always on the same row as in file1, and so on.
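For instance, a minimal sketch of that improvement, loading file2 into a dict once instead of reopening it for every line (file names are placeholders; this assumes file2 is a single JSON object as shown in the question):
import json

def fuse_fast(file1, file2, target_path):
    # load the timestamp mapping once; file2 is one JSON object
    with open(file2) as f:
        timestamps = json.load(f)
    with open(file1) as src, open(target_path, "w") as dst:
        for line in src:
            record = json.loads(line)  # one JSON object per line (ndjson)
            key = "frame" + str(record["frameNumber"])
            if key in timestamps:
                record[key] = timestamps[key]
            dst.write(json.dumps(record) + "\n")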
As was pointed out, one file is ndjson and the other file is json. You need to implement some logic to add the json to the ndjson.
# https://pypi.org/project/ndjson/
# pip install ndjson
import ndjson
import json

with open('path/to/file/im_a_ndjson.ndjson') as infile:
    ndjson_object = ndjson.load(infile)

with open('path/to/file/json_file2.json') as infile:
    dict_object = json.load(infile)

print(type(ndjson_object[0]['frameNumber']))
# output: <class 'int'>

for key in dict_object:
    # int needed as you can see above
    # (.strip('frame') strips the characters f, r, a, m, e, which is fine for keys like 'frame1')
    framenumber = int(key.strip('frame'))
    # find the matching ndjson object
    for ndjs in ndjson_object:
        if ndjs['frameNumber'] == framenumber:
            # add the key/value pair
            ndjs[key] = dict_object[key]
            # we can break as we've found it
            break

with open('path/to/file/new_ndjson.ndjson', 'w') as outfile:
    ndjson.dump(ndjson_object, outfile)

how to subsample a fasta file based on the headers if headers contain certain strings?

I have a fasta file like this:
>gi|373248686|emb|HE586118.1| Streptomyces albus subsp. albus salinomycin biosynthesis cluster, strain DSM 41398
GGATGCGAAGGACGCGCTGCGCAAGGCGCTGTCGATGGGTGCGGACAAGGGCATCCACGT
CGAGGACGACGATCTGCACGGCACCGACGCCGTGGGTACCTCGCTGGTGCTGGCCAAGGC
>gi|1139489917|gb|KX622588.1| Hyalangium minutum strain DSM 14724 myxochromide D subtype 1 biosynthetic gene cluster and tRNA-Thr gene, complete sequence
ATGCGCAAGCTCGTCATCACGGTGGGGATTCTGGTGGGGTTGGGGCTCGTGGTCCTTTGG
TTCTGGAGCCCGGGAGGCCCAGTCCCCTCCACGGACACGGAGGGGGAAGGGCGGAGTCAG
CGCCGGCAGGCCATGGCCCGGCCCGGCTCCGCGCAGCTGGAGAGTCCCGAGGACATGGGG
>gi|930076459|gb|KR364704.1| Streptomyces sioyaensis strain BCCO10_981 putative annimycin-type biosynthetic gene cluster, partial sequence
GCCGGCAGGTGGGCCGCGGTCAGCTTCAGGACCGTGGCCGTCGCGCCCGCCAGCACCACG
GAGGCCCCCACGGCCAGCGCCGGGCCCGTGCCCGTGCCGTACGCGAGGTCCGTGCTGAAC
and I have a text file containing a list of numbers:
373248686
930076459
296280703
......
I want to do the following:
if the header in the fasta file contains a number from the text file:
    write the match (header + sequence) to a new output.fasta file.
How to do this in Python? It seems easy, just some for loops should do the job, but somehow I cannot make it happen, and if my files are really big, a loop inside another loop may take a long time. Here's what I have tried:
from Bio import SeqIO
import sys

wanted = []
for line in open(sys.argv[2]):
    titles = line.strip()
    wanted.append(titles)

seqiter = SeqIO.parse(open(sys.argv[1]), 'fasta')
sys.stdout = open('output.fasta', 'w')

new_seq = []
for seq in seqiter:
    new_seq.append(seq if i in seq.id for i in wanted)

SeqIO.write(new_seq, sys.stdout, "fasta")
sys.stdout.close()
Got this error:
new_seq.append(seq if i in seq.id for i in wanted)
^
SyntaxError: invalid syntax
Is there a better way to do this?
Thank you!
Use a program like this:
from Bio import SeqIO
import sys

# read in the text file
numbersInTxtFile = set()

# hint: use "with", then you don't need to program the file closing;
# furthermore, error handling comes along with this too
with open(sys.argv[2], "r") as inF:
    for line in inF:
        line = line.strip()
        if line == "": continue
        numbersInTxtFile.add(int(line))

# read in the fasta file
with open(sys.argv[1], "r") as inF:
    for record in SeqIO.parse(inF, "fasta"):
        # now check if this record in the fasta file
        # has an id we are searching for
        name = record.description
        id = int(name.split("|")[1])
        # debug print only; leave it commented out when redirecting stdout
        # to a file, or it will corrupt the output fasta
        # print(id, numbersInTxtFile, id in numbersInTxtFile)
        if id in numbersInTxtFile:
            # we need to output
            print(">%s" % name)
            print(record.seq)
which you can then call like so from the command line:
python3 nameOfProg.py inputFastaFile.fa idsToSearch.txt > outputFastaFile.fa
Import your "keeper" IDs into a dictionary rather than a list; this will be much faster, as a list would have to be searched thousands of times.
keepers = {}
with open("ids.txt", "r") as id_handle:
    for curr_id in id_handle:
        # strip the newline so lookups can match
        keepers[curr_id.strip()] = True
A list comprehension generates a list, so you don't need to append to another list.
# the GI number is the second |-separated field of the record id
keeper_seqs = [x for x in seqiter if x.id.split("|")[1] in keepers]
With larger files you will want to loop over seqiter and write the entries one at a time to avoid memory issues.
You should also never assign to sys.stdout without a good reason; if you want to output to STDOUT, just use print or sys.stdout.write().
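For example, a minimal streaming sketch along those lines, assuming the GI number is the second |-separated field of each record id (file names are placeholders):
from Bio import SeqIO

# load the wanted IDs once, into a set for O(1) membership tests
with open("ids.txt") as id_handle:
    wanted = {line.strip() for line in id_handle if line.strip()}

with open("input.fasta") as in_handle, open("output.fasta", "w") as out_handle:
    for record in SeqIO.parse(in_handle, "fasta"):
        # e.g. "gi|373248686|emb|HE586118.1|" -> "373248686"
        if record.id.split("|")[1] in wanted:
            SeqIO.write(record, out_handle, "fasta")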

How can I change storing a dict from rows to columns using Python?

I have a simple question. I am using this code to store a dictionary of dictionaries in a csv file.
import csv

data = dataA, dataB, dataC, dataD
w = csv.writer(open("test.csv", "w"))
for x in data:
    for field, possible_values in x.items():
        print(field, possible_values)
        w.writerow([field, possible_values])
The values end up stored in rows, but I want them stored as columns.
My actual result in csv:
name: Alex
Old: 22
My target in csv should be like this:
Name Old
Alex 22
How can I change it?
Update1:
The clue is for key in x.keys(). After many hours of hard work I updated my code as shown below, and it works better than before, but I still have a small issue: getting a new line after storing all the keys (x.keys()) and values (y.values()) of my dictionaries in the csv file.
if not os.path.isfile(filename):
    outfile = open(filename, "w")
    #outfile.write("#Sequence,,,,")

for x in data:
    print(x.keys())
    for key in x.keys():
        print(key)
        store_key = key + ","
        outfile = open(filename, "a")
        outfile.write(store_key)
        outfile.close()

for y in data:
    print(y.values())
    for value in y.values():
        print(value)
        store_value = value + ","
        outfile = open(filename, "a")
        outfile.write(store_value)
        outfile.close()
Now I need to separate the keys and values, maybe with "\n", to get the values of the arrays under the line of keys.
Any help will be appreciated.
Accessing and writing to the file on every iteration will slow down your code. Hold the output text in a string, delete the last comma (,) and add a newline character to the end of the string. This will speed up your code, and the new line will be added. You can do the same for the second for loop too.
line = ""
for x in data:
for key in x.keys():
line += key + ","
line = line[:-1]+"\n" # Delete last character and add new line
outfile = open(filename, "a")
outfile.write(line)
outfile.close()
You can write a row of the dictionary keys, then a row of the values.
for x in data:
    w.writerow(x.keys())
    w.writerow(x.values())
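If all the dictionaries share the same keys, csv.DictWriter produces this header-plus-rows layout directly; a small sketch, with stand-in dicts in place of dataA, dataB, and so on:
import csv

# stand-ins for dataA, dataB, ...
data = [{"Name": "Alex", "Old": 22}, {"Name": "Kim", "Old": 35}]

with open("test.csv", "w", newline="") as f:
    w = csv.DictWriter(f, fieldnames=list(data[0].keys()))
    w.writeheader()    # one header row: Name,Old
    w.writerows(data)  # one row of values per dict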

Pass to a txt file unique rows

I am trying to write unique rows to a txt file after web scraping certain values. The txt file contains the following:
Current date Amount Gained
15/07/2017 660
16/07/2017 -200
17/07/2017 300
So basically what I want to do is write a script that only allows unique rows; I don't want any duplicates, because values change daily. If a user accidentally runs the script twice in one day, I don't want a duplicate row in my txt file, because it would affect further calculations in my data analysis. This is the function I currently have; what modifications should I make?
def Cost_Revenues_Difference():
    nrevenue = revenue
    ndifference = difference
    dateoftoday = time.strftime('%d/%m/%Y')
    Net_Result.append(nrevenue)
    with open('Net_Result.txt', 'a') as ac:
        for x in Net_Result:
            ac.write('\n' + dateoftoday + ' ' + str(Net_Result))

Cost_Revenues_Difference()
You can read all the data of your file into a list first:
with open('Net_Result.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
Then check whether the line you want to add already exists in your content list; if not, add that line to the file:
newLine = dateoftoday + ' ' + str(Net_Result)
if newLine not in content:
    ac.write('\n' + newLine)
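Put together, a hedged sketch of the whole function with this check (Net_Result and revenue are assumed to be defined as in the question):
import time

def Cost_Revenues_Difference():
    Net_Result.append(revenue)
    dateoftoday = time.strftime('%d/%m/%Y')
    newLine = dateoftoday + ' ' + str(Net_Result)

    # read the existing rows once, stripped of trailing newlines
    try:
        with open('Net_Result.txt') as f:
            content = [x.strip() for x in f.readlines()]
    except FileNotFoundError:
        content = []

    # append only if this exact row is not already present
    if newLine not in content:
        with open('Net_Result.txt', 'a') as ac:
            ac.write('\n' + newLine)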
If the file fits comfortably in RAM and has the structure given in the example lines, maybe dump the data as a Python object into a .pkl file. For example:
import pickle

data = {'15/07/2017': 660,
        '16/07/2017': -200,
        '17/07/2017': 300}

with open('/path/to/the/file.pkl', 'wb') as file:
    pickle.dump(data, file)
Pickle files are friendly for Python objects: you can use the dictionary's built-in methods to avoid redundant entries or to make updates.
For more complicated structures, take a look at pandas.DataFrame. If your program works with languages other than Python, json or xml might be better choices.
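For instance, a small sketch of reloading and updating that pickle (path as above); because the dates are dictionary keys, writing the same day twice simply overwrites the value instead of duplicating the row:
import pickle
import time

with open('/path/to/the/file.pkl', 'rb') as f:
    data = pickle.load(f)

# overwrites today's entry if it already exists, so no duplicate rows
data[time.strftime('%d/%m/%Y')] = 300

with open('/path/to/the/file.pkl', 'wb') as f:
    pickle.dump(data, f)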
There are many ways you can do this. Two alternatives are described below.
1 (this alt updates the value)
One is to put them in a dictionary as key/value pairs and use the json library to import and export the data (benefit: a very common data structure).
import json

with open("test.json") as f:
    data = json.loads(f.read())

data["18-05-17"] = 123

with open("test.json", "w") as f:
    json.dump(data, f, indent=4)
Test.json
{
    "18-05-17": 123,
    "17-05-17": 123
}
As a dictionary can only hold unique keys, you won't have duplicates.
2 (this alt will not update the value)
Another solution that comes to mind is to put the current date in the filename:
import datetime
import os

today = datetime.datetime.today().strftime("%y%m%d")
filedate = [i for i in os.listdir() if i.startswith("Net_result")][0]

# If today is different than the filedate, continue
if today != os.path.splitext(filedate)[0].split("_")[-1]:
    # code here
    with open(filedate, "a") as f:
        f.write('\n' + dateoftoday + ' ' + str(Net_Result))
    # rename
    os.rename(filedate, "Net_result_{}.csv".format(today))
You could start with a file bearing yesterday's date ("Net_result_170716"); the code would check whether the file ending differs from today (which it does), add the new value, rename the file and save. Running the code again would not do anything (not even open the file).

String Replace in csv

Below, I am trying to replace data in a csv. The code works, but it replaces anything in the file matching stocklevel.
def updatestocklevel(quantity, stocklevel, code):
    newlevel = stocklevel - quantity
    stocklevel = str(stocklevel)
    newlevel = str(newlevel)
    s = open("stockcontrol.csv").read()
    s = s.replace(stocklevel, newlevel)  # be careful - will currently replace any number in the file matching stock level!
    f = open("stockcontrol.csv", 'w')
    f.write(s)
    f.close()
My csv looks like this:
34512340,1
12395675,2
56756777,1
90673412,2
12568673,3
22593672,5
65593691,4
98593217,2
98693214,2
98693399,5
11813651,85
98456390,8
98555567,3
98555550,45
98553655,2
96553657,1
91823656,2
99823658,2
Elsewhere in my program, I have a function that searches for the code (8 digits).
Is it possible to say: if the code is in a line of the csv, replace the data in the second column (data[2])?
All the occurrences of stocklevel are getting replaced with the value of newlevel because you are calling s.replace(stocklevel, newlevel).
string.replace(s, old, new[, maxreplace]): Return a copy of string s with all occurrences of substring old replaced by new. If the optional argument maxreplace is given, the first maxreplace occurrences are replaced.
source
As you suggested, you need to get the code and use it to replace the stock level.
This is a sample script which takes the 8-digit code and the new stock level as command line arguments and replaces the value:
import sys
import re

code = sys.argv[1]
newval = int(sys.argv[2])

f = open("stockcontrol.csv")
data = f.readlines()
print data

for i, line in enumerate(data):
    if re.search('%s,\d+' % code, line):      # search for the line with the 8-digit code
        data[i] = '%s,%d\n' % (code, newval)  # replace the stock value with the new value in the same line
f.close()

f = open("stockcontrol.csv", "w")  # write back to the input file
f.write("".join(data))
print data
f.close()
Another solution using the csv module of Python:
import sys
import csv

data = []
code = sys.argv[1]
newval = int(sys.argv[2])

f = open("stockcontrol.csv")
reader = csv.DictReader(f, fieldnames=['code', 'level'])
for line in reader:
    if line['code'] == code:
        line['level'] = newval
    data.append('%s,%s' % (line['code'], line['level']))
f.close()

f = open("stockcontrol.csv", "w")
f.write("\n".join(data))
f.close()
Warning: keep a backup of the input file while trying out these scripts, as they overwrite the input file.
If you save the script in a file called test.py, invoke it as:
python test.py 34512340 10
This should change the stock value for code 34512340 to 10.
Why not use good old regular expressions?
import re

code, new_value = '11813651', '885'  # e.g. change 85 to 885 for code 11813651
# re.M makes ^ match at the start of every line, not just the start of the file
print(re.sub(r'(^%s,).*' % code, r'\g<1>' + new_value,
             open('stockcontrol.csv').read(), flags=re.M))
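To persist the change rather than just print it, you could write the substituted text back to the file, e.g.:
import re

code, new_value = '11813651', '885'
with open('stockcontrol.csv') as f:
    text = f.read()

text = re.sub(r'(^%s,).*' % code, r'\g<1>' + new_value, text, flags=re.M)

with open('stockcontrol.csv', 'w') as f:
    f.write(text)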
Since it's a csv file, I'd suggest using Python's csv module. You will need to write to a new file, since reading from and writing to the same file will turn out badly. You can always rename it afterwards.
This example uses StringIO (Python 2) to embed your csv data in the code and treat it as a file; normally you would open a file to get the input.
Updated
import csv

# Python 2 and 3
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

CSV = """\
34512340,1
12395675,2
56756777,1
90673412,2
12568673,3
22593672,5
65593691,4
98593217,2
98693214,2
98693399,5
11813651,85
98456390,8
98555567,3
98555550,45
98553655,2
96553657,1
91823656,2
99823658,2
"""

def replace(key, value):
    fr = StringIO(CSV)
    with open('out.csv', 'w') as fw:
        r = csv.reader(fr)
        w = csv.writer(fw)
        for row in r:
            if row[0] == key:
                row[1] = value
            w.writerow(row)

replace('99823658', 42)
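To follow the rename-afterwards advice with a real input file, here is a small sketch using a temporary file and os.replace, assuming Python 3 and the file name from the question:
import csv
import os

def update_stock(key, value, path='stockcontrol.csv'):
    tmp_path = path + '.tmp'
    # write the updated rows to a temporary file first ...
    with open(path, newline='') as fr, open(tmp_path, 'w', newline='') as fw:
        w = csv.writer(fw)
        for row in csv.reader(fr):
            if row[0] == key:
                row[1] = value
            w.writerow(row)
    # ... then atomically swap it in place of the original
    os.replace(tmp_path, path)

update_stock('99823658', 42)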
