I have the following script that I wrote in order to parse some logs. The script works fine, but now I want to add something that will count the duplicates and add that count to the list. I have been trying to add that little piece for the last few days and I can't figure out how to do it. Any help would be very much appreciated. I am trying to add it in the function called strip_dupes, or maybe I am looking at it wrong and need to create a new function altogether to do this.
import sys
import collections
def strip_dupes(lst):
    """Return the items of ``lst`` with repeats removed, keeping first-seen order.

    Each repeated occurrence is reported on stdout as it is encountered.

    Args:
        lst: Iterable of hashable items (the script passes
            ``(source, dest, service)`` tuples).

    Returns:
        A new list holding the first occurrence of each distinct item.
    """
    seen = set()
    keep = []
    for sd in lst:
        if sd in seen:
            # Wrap in a 1-tuple so a tuple item is formatted as one value.
            print("Duplicate Found: %s" % (sd,))
        else:
            seen.add(sd)
            keep.append(sd)
    return keep
def format_traffic(ltpl, fname):
    """Write one formatted line per traffic flow to the file ``fname``.

    Args:
        ltpl: Iterable of ``(source, destination, service)`` triples.
        fname: Path of the file to create or overwrite.
    """
    # Write to the path that was passed in, not to a module-level global, so
    # the function works regardless of what the caller named its variable.
    with open(fname, "w") as ofile:
        for s, d, t in ltpl:
            ofile.write("Source %s -> Destination: %s -> Service: %s\n" % (s, d, t))
# Usage: <script> <log_file> <output_file>
# Reads the log, extracts one (source, dest, service) flow per line, drops
# repeated flows and writes the remainder to the output file.
with open(sys.argv[1], "r") as log_file:
    lines = log_file.readlines()
output_file = sys.argv[2]
traffic_list = []
for l in lines:
    words = l.split()
    if not words:
        # Blank lines carry no flow; indexing them would raise IndexError.
        continue
    # Fields are addressed by whitespace-separated position; surrounding
    # double quotes are removed from each value.
    source = words[9].strip('"')
    dest = words[10].strip('"')
    serv = words[7].strip('"')
    flow = (source, dest, serv)
    traffic_list.append(flow)
final = strip_dupes(traffic_list)
format_traffic(final, output_file)
Related
This code works, but I have to process the files one by one. I need to point it at the folder that contains the files and save the results in another folder.
I can't figure it out :( Can anybody help me? I'm new to Python. Thank you, I appreciate it :)
import re
import string
import sys
# Count how often each "-word" token (a hyphen followed by word characters)
# occurs in the input file and write one "token count" line per distinct token.
frequency = {}
with open('C:/Users/Desktop/app/data/sources/books/test.txt', 'r') as source:
    text_string = source.read()
match_pattern = re.findall(r'([-][\w]+)', text_string)
for word in match_pattern:
    frequency[word] = frequency.get(word, 0) + 1
frequency_list = frequency.keys()
# Hand the output handle to print() rather than rebinding sys.stdin and
# sys.stdout: both files get closed and the real standard streams stay usable.
with open('C:/Users/Desktop/app/data/fre/news/test.txt', 'w') as sink:
    for word in frequency_list:
        print(word, frequency[word], file=sink)
Maybe something like this?
import glob
import os
# Collect the path of every .txt file in the source directory; each path is
# read and processed by the loop further down.
books = glob.glob("C:/Users/Desktop/app/data/sources/books/*.txt")
# now you have a list of all .txt files in that directory.
def writer(text_string, output_file):
    """Append the frequency of every ``-word`` token in a text to a file.

    Args:
        text_string: Text to scan for tokens made of a hyphen followed by
            one or more word characters.
        output_file: Path of the file to append to. One ``token count`` line
            is written per distinct token.
    """
    from collections import Counter

    frequency = Counter(re.findall(r'([-][\w]+)', text_string))
    if not frequency:
        # Nothing matched: do not create an empty output file.
        return
    # Open the output once for all tokens; opening it per token leaves one
    # unclosed handle behind for every line written.
    with open(output_file, "a") as out:
        for word, count in frequency.items():
            print(word, count, file=out)
# now you have a function that essentially does the procedure you already know works
# Run every collected book through writer(), sending the counts to a file of
# the same name in the output directory.
for book in books:
    book_name = os.path.basename(book)  # <filename>.txt without its directory
    # The context manager closes the input as soon as it has been read.
    with open(book, "r") as file:
        text_string = file.read()
    output_file = "C:/Users/Desktop/app/data/fre/news/" + book_name
    writer(text_string, output_file)
This code will iterate through the .txt files in the directory you were reading from.
I encapsulated your working code in a function (somewhat reformatted for clarity, you can specify where to print to directly from the print function), so as you iterate through the files you can read them in and drop them through the working code.
I have the following code:
import os
import json
import ipaddress
iplist = []
ipiflist = []
mydict = {}
# Merge every JSON file found in data/ into mydict; when two files share a
# key, the file read later overwrites the earlier value.
for filename in os.listdir('data/'):
    json_path = os.path.join('data/', filename)
    with open(json_path, 'r') as f:
        data = json.load(f)
    mydict.update(data)
    # Printed on every pass, so the output shows the dict growing file by file.
    print(mydict)
In the data directory there are several JSON files that I open in this loop.
I update the dict in every loop and for this reason I get the following output:
{'ipif_1001': '10.10.160.129', 'ipif_1002': '10.10.160.142', 'ipif_1003': '10.10.160.169', 'ipif_1004': '10.10.160.173', 'ipif_3334': '10.10.160.194', 'IpIf3337': '10.10.160.126'}
{'ipif_1001': '10.10.160.129', 'ipif_1002': '10.10.160.142', 'ipif_1003': '10.10.160.170', 'ipif_1004': '10.10.160.174', 'ipif_3334': '10.10.160.194', 'IpIf3337': '10.10.160.126', 'ipif_1005': '10.10.160.178', 'ipif_1006': '10.10.160.182'}
{'ipif_1001': '10.10.160.129', 'ipif_1002': '10.10.160.142', 'ipif_1003': '10.10.160.170', 'ipif_1004': '10.10.160.174', 'ipif_3334': '10.10.160.194', 'IpIf3337': '10.10.160.126', 'ipif_1005': '10.10.160.178', 'ipif_1006': '10.10.160.182', 'IpIf1001': '10.10.160.138', 'IpIf1002': '10.10.160.141', 'IpIf1003': '10.10.160.153', 'IpIf1006': '10.10.160.181', 'IpIf_CPEDCN': '10.10.160.241', 'IpIf_DCNMgt': '10.10.191.253', 'ipif1164': '10.10.160.166', 'IpIf1010': '10.10.170.1'}
I only need the summarized output from the last loop. How can I only access this?
Thanks for your help
The for loop in Python has an else clause, which is executed only when the loop finishes without hitting a break. So you could print your final result there:
for filename in os.listdir('data/'):
    json_path = os.path.join('data/', filename)
    with open(json_path, 'r') as f:
        data = json.load(f)
    mydict.update(data)
else:
    # Runs once, after the loop has ended without a `break`, so only the
    # fully merged dict is printed.
    print(mydict)
import os
import json
import ipaddress
iplist = []
ipiflist = []
mydict = {}
# List the directory once so the loop and the "last file" test see the same
# entries; the name deliberately avoids shadowing the built-in `list`.
filenames = os.listdir('data/')
for filename in filenames:
    with open(os.path.join('data/', filename), 'r') as f:
        data = json.load(f)
    # Merge every file, so the dict printed at the end is the full summary.
    mydict.update(data)
    # filenames[-1] is the last directory entry: print only on the final pass.
    if filename == filenames[-1]:
        print(mydict)
Try it like this
I am trying to get the reverse sequences orientated correctly in a file. This is the code:
import os
import sys
import pysam
from Bio import SeqIO, Seq, SeqRecord
def main(in_file):
    """Convert the BAM file ``in_file`` to FASTA.

    The output path is the input path with its extension replaced by ``.fa``.
    """
    stem = os.path.splitext(in_file)[0]
    out_file = "%s.fa" % stem
    # bam_to_rec is a generator, so records are produced one at a time while
    # SeqIO.write consumes them; large BAM files are never held in memory.
    records = bam_to_rec(in_file)
    with open(out_file, "w") as out_handle:
        SeqIO.write(records, out_handle, "fasta")
def bam_to_rec(in_file):
    """Generator to convert BAM files into Biopython SeqRecords.

    Yields one ``SeqRecord`` per read in ``in_file``, using the read name as
    the record id. Reads flagged as reverse are yielded reverse-complemented.
    """
    bam_file = pysam.Samfile(in_file, "rb")
    try:
        for read in bam_file:
            seq = Seq.Seq(read.seq)
            if read.is_reverse:
                seq = seq.reverse_complement()
            yield SeqRecord.SeqRecord(seq, read.qname, "", "")
    finally:
        # Release the BAM handle even when the consumer stops iterating early.
        bam_file.close()
if __name__ == "__main__":
    # main() takes exactly one path; report bad usage with a readable message
    # instead of a TypeError traceback.
    if len(sys.argv) != 2:
        sys.exit("usage: %s <in_file.bam>" % sys.argv[0])
    main(*sys.argv[1:])
When I print out the reverse sequences, the code works. But in the output file they are still written out as reverse sequences. Can anyone help me find out what is going wrong?
Here is the link to my infile:
https://www.dropbox.com/sh/68ui8l7nh5fxatm/AABUr82l01qT1nL8I_XgJaeTa?dl=0
Note the ugly counter is just to print 10000 sequences, not more.
comparing one without ever reversing with one that reverses if needed
Here's the output for a couple of sequences; feel free to test it. I think your issue is that yield returns an iterator but you are not iterating over it, unless I am misunderstanding what you are doing:
Original:
SOLEXA-1GA-2:2:93:1281:961#0
GGGTTAGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG
Becomes:
SOLEXA-1GA-2:2:93:1281:961#0
CTAACCCTAACCCTAACCCTAACCCTAACCTAACCC
And if not reverse:
Original:
SOLEXA-1GA-2:2:12:96:1547#0
ACACACAAACACACACACACACACACACACACCCCC
Becomes:
SOLEXA-1GA-2:2:12:96:1547#0
ACACACAAACACACACACACACACACACACACCCCC
Here's my code:
import os
import sys
import pysam
from Bio import SeqIO, Seq, SeqRecord
def main(in_file):
    """Write up to 10000 reads from a BAM file as FASTA, in two variants.

    ``<in_file without extension>.fa`` receives the records produced by
    ``bam_to_rec`` and ``test_non_reverse.txt`` receives the records produced
    by ``convert_to_seq``, so the two outputs can be compared side by side.
    """
    from itertools import islice

    limit = 10000  # maximum number of records written to each file
    out_file = "%s.fa" % os.path.splitext(in_file)[0]
    with open('test_non_reverse.txt', 'w') as non_reverse:
        with open(out_file, "w") as out_handle:
            # islice caps the lazy record stream without a manual counter;
            # SeqIO.write accepts any iterable of records.
            SeqIO.write(islice(bam_to_rec(in_file), limit), out_handle, "fasta")
            SeqIO.write(islice(convert_to_seq(in_file), limit), non_reverse, 'fasta')
def convert_to_seq(in_file):
    """Generator yielding one Biopython SeqRecord per read of a BAM file.

    The sequence is taken from the read as-is (no reverse-complementing) and
    the read name is used as the record id.
    """
    bam_file = pysam.Samfile(in_file, "rb")
    try:
        for read in bam_file:
            seq = Seq.Seq(read.seq)
            yield SeqRecord.SeqRecord(seq, read.qname, "", "")
    finally:
        # Release the BAM handle even when the consumer stops iterating early.
        bam_file.close()
def bam_to_rec(in_file):
    """Generator to convert BAM files into Biopython SeqRecords.

    Yields one ``SeqRecord`` per read in ``in_file``, using the read name as
    the record id. Reads flagged as reverse are yielded reverse-complemented.
    """
    bam_file = pysam.Samfile(in_file, "rb")
    try:
        for read in bam_file:
            seq = Seq.Seq(read.seq)
            if read.is_reverse:
                seq = seq.reverse_complement()
            yield SeqRecord.SeqRecord(seq, read.qname, "", "")
    finally:
        # Release the BAM handle even when the consumer stops iterating early.
        bam_file.close()
if __name__ == "__main__":
    # main() takes exactly one path; report bad usage with a readable message
    # instead of a TypeError traceback.
    if len(sys.argv) != 2:
        sys.exit("usage: %s <in_file.bam>" % sys.argv[0])
    main(*sys.argv[1:])
I was trying to make a tool that updates yaml values in files that have "PENDING" in them. It does work, but I need it to be formatted like this:
fields:
setName: ("name")
WishName: ("name")
WishNameState: ("PENDING")
However, it wants to dump it in this format:
fields: {WishName: ("name"), WishNameState: ("APPROVED"), setName: ("name")}
How can I make it dump in the format I want it to?
Here's my code, so you know how I'm currently doing it:
import glob
import os
import yaml
def processFile(f, t):
    """Return the 1-based numbers of the lines in file ``f`` that contain ``t``.

    Args:
        f: Path of the file to scan.
        t: Search term, given as text or as bytes.

    Returns:
        List of matching line numbers in ascending order, counting from 1.
        Both CRLF and LF line endings are recognised.
    """
    # Read raw bytes so line endings are not translated by the platform, and
    # close the file as soon as its content is in memory.
    with open(f, 'rb') as handle:
        data = handle.read()
    # The file content is bytes, so the term must be bytes too; text terms
    # are encoded (on Python 2 a plain str already is bytes).
    if not isinstance(t, bytes):
        t = t.encode('utf-8')
    lines = data.replace(b'\r\n', b'\n').split(b'\n')
    return [number for number, line in enumerate(lines, 1) if t in line]
import sys  # required by the sys.exit() call in the invalid-response branch

# For every YAML file in the current directory that mentions the term, show
# the requested name, ask for approval or rejection, and write the decision
# back to the same file.
term = 'PENDING'
for x in glob.glob('*.yaml'):
    r = processFile(x, term)
    if r:
        with open(x) as f:
            # NOTE(review): yaml.load can construct arbitrary objects; if these
            # files can come from an untrusted source, use yaml.safe_load.
            yamlfile = yaml.load(f)
        fields = yamlfile['fields']
        name = fields['WishName']
        print('Name: ' + name)
        print('Approve or reject?')
        aor = raw_input('a/r: ')
        if aor == 'a':
            fields['setName'] = name
            fields['WishNameState'] = '("APPROVED")'
        elif aor == 'r':
            fields['WishNameState'] = '("REJECTED")'
        else:
            print('Invalid response. Shutting down...')
            sys.exit()
        # default_flow_style=False writes nested mappings in block style (one
        # "key: value" per line) instead of the inline {a: 1, b: 2} form.
        with open(x, "w") as f:
            yaml.dump(yamlfile, f, default_flow_style=False)
print('End of results!')
Any and all help is appreciated! Thanks :)
In your code, replace
yaml.dump(yamlfile, f)
with
yaml.dump(yamlfile, f, default_flow_style=False)
I am trying to get a program up and running that takes astronomical data files with the extension .fits and takes all of the files with that extension in a folder and searches for specific header information, and subsequently places it into a text folder corresponding to each file. I am using a while loop, and please forgive me if this code is badly formatted, it is my first time using python! My main problem is that I can only get the program to read one file before it closes itself.
#!/usr/bin/env python
#This code properly imports all '.fits' files in a specified directory and
#outputs them into a .txt format that allows several headers and their contained
#data to be read.
import copy
import sys
import pyfits
import string
import glob
import os.path
import fnmatch
import numpy as np
# Ask the user which directory holds the '.fits' files and make it the working
# directory, so the relative '*.fits' patterns used further down resolve
# inside it.
DIR = raw_input("Please input a valid directory : ") #-----> This prompts for input from the user to find the '.fits' files
os.chdir(DIR)
initialcheck = 0 #Initiates the global counter for the number of '.fits' files in the specified directory
targetcheck = 0 #Initiates the global counter for the amount of files that have been processed
def checkinitial(TD):
    """Count the '.fits' files in directory ``TD`` and record the total.

    The count is stored in the module-level ``initialcheck`` and returned.
    Exits the program via ``sys.exit()`` when the directory holds no such
    file.
    """
    global initialcheck
    # A single listing is enough: the length of the match list is the count.
    initialcheck = len(glob.glob(os.path.join(TD, '*.fits')))
    if initialcheck == 0:
        print('There are no .FITS files in this directory! Try Again...')
        sys.exit()
    return initialcheck
def _header_value(header, keyword):
    """Return the value stored under ``keyword`` in a FITS header, as a 1-item list."""
    # Kept as a one-element list so the value is rendered with brackets in
    # the report, matching the established output format.
    return [header[header.index(keyword)]]


def sorter(TD, targcheck, inicheck):
    """Write a 'Processed <name>.txt' header summary for every '.fits' file.

    Files are matched with the relative pattern '*.fits', i.e. in the current
    working directory, and each report is written next to its source file.
    The module-level ``targetcheck`` is incremented once per report written.

    Args:
        TD: Target directory. Accepted for call compatibility; the lookup
            itself is relative to the current working directory.
        targcheck: Accepted for call compatibility; not used.
        inicheck: Accepted for call compatibility; not used.

    Returns:
        The (already closed) file object of the last report written, or
        ``None`` when no file matched.
    """
    global targetcheck
    # Column numbers whose TTYPE<n>/TUNIT<n> keywords are reported.
    columns = ('1', '8', '9', '11', '12', '13', '14')
    Results = None
    for allfiles in glob.iglob('*.fits'):
        print(allfiles)  # name of the file currently being processed
        with pyfits.open(allfiles) as HDU:
            # Adjust the HDU indices below if a keyword lives in another header.
            HDU_HD_0 = HDU[0].header
            HDU_HD_1 = HDU[1].header
            # Read everything that is needed while the FITS file is still
            # open; the with-block closes it afterwards.
            File_Data_KID = _header_value(HDU_HD_0, 'KEPLERID')
            File_Data_CHAN = _header_value(HDU_HD_0, 'SKYGROUP')
            Astro_Data = [_header_value(HDU_HD_1, 'TTYPE' + n) for n in columns]
            Astro_Data_Unit = [_header_value(HDU_HD_1, 'TUNIT' + n) for n in columns]
        Title1_Format = '{0}-----{1}'.format('Kepler I.D.', 'Channel')
        Title2_Format = '-{0}--------{1}------------{2}------------{3}------------{4}------------{5}-------------{6}-'.format('TTYPE1', 'TTYPE8', 'TTYPE9', 'TTYPE11', 'TTYPE12', 'TTYPE13', 'TTYPE14')
        File_Format = '{0}--------{1}'.format(File_Data_KID, File_Data_CHAN)
        Astro_Format = '{0}---{1}---{2}---{3}---{4}---{5}---{6}'.format(*Astro_Data)
        Astro_Format_Units = '{0} {1} {2} {3} {4} {5} {6}'.format(*Astro_Data_Unit)
        # The handle is not called `copy`, which would hide the copy module.
        with open('Processed ' + allfiles + ".txt", "w") as report:
            targetcheck += 1
            report.write("%s\n" % Title1_Format)
            report.write("%s\n" % File_Format)
            report.write('\n')
            report.write("%s\n" % Title2_Format)
            report.write("%s\n" % Astro_Format)
            report.write('\n')
            report.write("%s\n" % Astro_Format_Units)
        # Remember the handle but keep looping: returning here would stop
        # after the first file.
        Results = report
    return Results
# checkinitial() exits the program when the directory has no '.fits' files;
# otherwise sorter() goes on to write the text reports.
checkinitial(DIR)
sorter(DIR, targetcheck, initialcheck)
I think you keep getting confused between a single file and a list of files. Try something like this:
def checkinitial(TD):
    """Return the number of '.fits' files in directory ``TD``.

    Exits the program via ``sys.exit()`` when there are none.
    """
    # glob.glob is the documented API; joining the directory onto the pattern
    # restricts the match to entries of TD.
    check = len(glob.glob(os.path.join(TD, "*.fits")))
    if not check:
        print('There are no .FITS files in this directory! Try Again...')
        sys.exit()
    return check
def sorter(TD, targcheck, inicheck):
"""This function will call the two counters and compare them until the number of processed
files is greater than the files in the directory, thereby finishing the loop
"""
for in_file in glob.iglob(os.path.join(TD,'*.fits')):
print in_file # This prints out the filenames the program is currently processing
with pyfits.open(in_file) as HDU:
# <Process input file HDU here>
out_file_name = 'Processed_' + os.path.basename(in_file) + ".txt"
with open(os.path.join(TD, out_file_name), "w") as copy:
# <Write stuff to your output file copy here>