multiple modification to a list at once - python

I have a text file of some ip's and Mac's. The format of the Mac's are xxxx.xxxx.xxxx, I need to change all the MAC's to xx:xx:xx:xx:xx:xx
I am already reading the file and putting it into a list. Now I am looping through each line of the list and I need to make multiple modification. I need to remove the IP's and then change the MAC format.
The problem I am running into is that I cant seem to figure out how to do this in one shot unless I copy the list to a newlist for every modification.
How can I loop through the list once, and update each element on the list with all my modification?
count = 0
output3 = []
for line in output:
#print(line)
#removes any extra spaces between words in a string.
output[count] = (str(" ".join(line.split())))
#create a new list with just the MAC addresses
output3.append(str(output[count].split(" ")[3]))
#create a new list with MAC's using a ":"
count += 1
print(output3)

It appears you are trying to overthink the problem, so that may be where your frustration is spinning you around a bit.
First, you should always consider if you need a count variable in python. Usually you do not, and the enumerate() function is your friend here.
Second, there is no need to process data multiple times in python. You can use variables to your advantage and leverage python's expressiveness, rather than trying to hide your problem from the language.
PSA an implementation example that may help you think through your approach. Good luck on solving your harder problems, and I hope python will help you out with them!
#! /usr/bin/env python3
import re
from typing import Iterable
# non-regex reformat mac to be xx:xx:xx:xx:xx:xx
# NOTE: this assumes a source with '.' separators only
# reformat_mac = lambda _: ':'.join(_ for _ in _.split('.') for _ in (_[:2], _[2:]))
# regex reformat mac to be xx:xx:xx:xx:xx:xx
# NOTE: Only requires at least two hex digits adjacent at a time
reformat_mac = lambda _: ":".join(re.findall(r"(?i)[\da-f]{2}", _))
def generate_output3(output: Iterable[str]) -> Iterable[str]:
for line in output:
col1, col2, col3, mac, *cols = line.split()
mac = reformat_mac(mac)
yield " ".join((col1, col2, col3, mac, *cols))
if __name__ == "__main__":
output = [
"abc def ghi 1122.3344.5566",
"jklmn op qrst 11a2.33c4.55f6 uv wx yz",
"zyxwu 123 next 11a2.33c4.55f6 uv wx yz",
]
for line in generate_output3(output):
print(line)

Solution
You can use the regex (regular expression) module to extract any pattern that matches that of the
mac-ids: "xxxx:xxxx:xxxx" and then process it to produce the expected output ("xx-xx-xx-xx-xx-xx")
as shown below.
Note: I have used a dummy data file (see section: Dummy Data below) to make this answer
reproducible. It should work with your data as well.
# import re
filepath = "input.txt"
content = read_file(filepath)
mac_ids = extract_mac_ids(content, format=True) # format=False --> "xxxx:xxxx:xxxx"
print(mac_ids)
## OUTPUT:
#
# ['a0-b1-ff-33-ac-d5',
# '11-b9-33-df-55-f6',
# 'a4-d1-e7-33-ff-55',
# '66-a1-b2-f3-b9-c5']
Code: Convenience Functions
How does the regex work? see this example
def read_file(filepath: str):
"""Reads and returns the content of a file."""
with open(filepath, "r") as f:
content = f.read() # read in one attemp
return content
def format_mac_id(mac_id: str):
"""Returns a formatted mac_id.
INPUT FORMAT: "xxxxxxxxxxxx"
OUTPUT FORMAT: "xx-xx-xx-xx-xx-xx"
"""
mac_id = list(mac_id)
mac_id = ''.join([ f"-{v}" if (i % 2 == 0) else v for i, v in enumerate(mac_id)])[1:]
return mac_id
def extract_mac_ids(content: str, format: bool=True):
"""Extracts and returns a list of formatted mac_ids after.
INPUT FORMAT: "xxxx:xxxx:xxxx"
OUTPUT FORMAT: "xx-xx-xx-xx-xx-xx"
"""
import re
# pattern = "(" + ':'.join([r"\w{4}"]*3) + "|" + ':'.join([r"\w{2}"]*6) + ")"
# pattern = r"(\w{4}:\w{4}:\w{4}|\w{2}:\w{2}:\w{2}:\w{2}:\w{2}:\w{2})"
pattern = r"(\w{4}:\w{4}:\w{4})"
pat = re.compile(pattern)
mac_ids = pat.findall(content) # returns a list of all mac-ids
# Replaces the ":" with "" and then formats
# each mac-id as: "xx-xx-xx-xx-xx-xx"
if format:
mac_ids = [format_mac_id(mac_id.replace(":", "")) for mac_id in mac_ids]
return mac_ids
Dummy Data
The following code block creates a dummy file with some sample mac-ids.
filepath = "input.txt"
s = """
a0b1:ff33:acd5 ghwvauguvwi ybvakvi
klasilvavh; 11b9:33df:55f6
haliviv
a4d1:e733:ff55
66a1:b2f3:b9c5
"""
# Create dummy data file
with open(filepath, "w") as f:
f.write(s)

Related

Parse lines in files with similar strings using Python

AH! I'm new to Python. Trying to get the pattern here, but could use some assistance to get unblocked.
Scenario:
testZip.zip file with test.rpt files inside
The .rpt files have multiple areas of interest ("AOI") to parse
AOI1: Line starting with $$
AOI2: Multiple lines starting with a single $
Goal:
To get AOI's into tabular format for upload to SQL
Sample file:
$$ADD ID=TEST BATCHID='YEP' PASSWORD=NOPE
###########################################################################################
$KEY= 9/21/2020 3:53:55 PM/2002/B0/295.30/305.30/4FAOA973_3.0_v2.19.2.0_20150203_1/20201002110149
$TIMESTAMP= 20201002110149
$MORECOLUMNS= more columns
$YETMORE = yay
Tried so far:
import zipfile
def get_aoi1(zip):
z = zipfile.ZipFile(zip)
for f in z.namelist():
with z.open(f, 'r') as rptf:
for l in rptf.readlines():
if l.find(b"$$") != -1:
return l
def get_aoi2(zip):
z = zipfile.ZipFile(zip)
for f in z.namelist():
with z.open(f, 'r') as rptf:
for l in rptf.readlines():
if l.find(b"$") != -1:
return l
aoi1 = get_aoi1('testZip.zip')
aoi2 = get_aoi2('testZip.zip')
print(aoi1)
print(aoi2)
Results:
I get the same results for both functions
b"$$ADD ID=TEST BATCHID='YEP' PASSWORD=NOPE\r\n"
b"$$ADD ID=TEST BATCHID='YEP' PASSWORD=NOPE\r\n"
How do I get the results in text instead of bytes (b) and remove the \r\n from AOI1?
There doesn't seem to be an r option for z.open()
I've been unsuccessful with .strip()
EDIT 1:
Thanks for the pep #furas!
return l.strip().decode() worked for removing the new line and b
How do I get the correct results from AOI2 (lines with a single $ in a tabular format)?
EDIT 2:
#furas 2021!
Adding the following logic to aoi2 function worked great.
col_array = []
for l in rptf.readlines():
if not l.startswith(b"$$") and l.startswith(b"$"):
col_array.append(l)
return col_array

Extract and modify substring from file path

I have a file path saved as filepath in the form of /home/user/filename. Some examples of what the filename could be:
'1990MAlogfile'
'Tantrologfile'
'2003RF_2004logfile'
I need to write something that turns the filepath into just part of the filename (but I do not have just the filename saved as anything yet). For example:
/home/user/1990MAlogfile becomes '1990 MA', /home/user/Tantrologfile becomes 'Tantro', or /home/user/2003RF_2004logfile becomes '2003 RF'.
So I need everything after the last forward slash and before an underscore if it's present (or before the 'logfile' if it's not), and then I need to insert a space between the last number and first letter if there are numbers present. Then I'd like to save the outcome as objkey. Any idea on how I could do this? I was thinking I could use regex, but don't know how I would handle inserting a space in those certain cases.
Code
def get_filename(filepath):
import re
temp = os.path.basename(example)[:-7].split('_')[0]
a = re.findall('^[0-9]*', temp)[0]
b = temp[len(a):]
return ' '.join([a, b])
example = '/home/user/2003RF_2004logfile'
objkey = get_filename(example)
Explanation
import regular expression package
import re
example filepath
example = '/home/user/2003RF_2004logfile'
/home/user/2003RF_2004logfile
get the filename and remove everything after the _
temp = example.split('/')[-1].split('_')[0]
2003RF
get the beginning portion (splits if numbers at the beginning)
a = re.findall('^[0-9]*', temp)[0]
2003
get the end portion
b = temp[len(a):]
RF
combine the beginning and end portions
return ' '.join([a, b])
2003 RF
import os, re, string
mystr = 'home/user/2003RF_2004logfile'
def format_str(str):
end = os.path.split(mystr)[-1]
m1 = re.match('(.+)logfile', end)
try:
this = m1.group(1)
this = this.split('_')[0]
except AttributeError:
return None
m2 = re.match('(.+[0-9])(.+)', this)
try:
return " ".join([m2.group(1), m2.group(2)])
except AttributeError:
return this

python newbie - where is my if/else wrong?

Complete beginner so I'm sorry if this is obvious!
I have a file which is name | +/- or IG_name | 0 in a long list like so -
S1 +
IG_1 0
S2 -
IG_S3 0
S3 +
S4 -
dnaA +
IG_dnaA 0
Everything which starts with IG_ has a corresponding name. I want to add the + or - to the IG_name. e.g. IG_S3 is + like S3 is.
The information is gene names and strand information, IG = intergenic region. Basically I want to know which strand the intergenic region is on.
What I think I want:
open file
for every line, if the line starts with IG_*
find the line with *
print("IG_" and the line it found)
else
print line
What I have:
with open(sys.argv[2]) as geneInfo:
with open(sys.argv[1]) as origin:
for line in origin:
if line.startswith("IG_"):
name = line.split("_")[1]
nname = name[:-3]
for newline in geneInfo:
if re.match(nname, newline):
print("IG_"+newline)
else:
print(line)
where origin is the mixed list and geneInfo has only the names not IG_names.
With this code I end up with a list containing only the else statements.
S1 +
S2 -
S3 +
S4 -
dnaA +
My problem is that I don't know what is wrong to search so I can (attempt) to fix it!
Below is some step-by-step annotated code that hopefully does what you want (though instead of using print I have aggregated the results into a list so you can actually make use of it). I'm not quite sure what happened with your existing code (especially how you're processing two files?)
s_dict = {}
ig_list = []
with open('genes.txt', 'r') as infile: # Simulating reading the file you pass in sys.argv
for line in infile:
if line.startswith('IG_'):
ig_list.append(line.split()[0]) # Collect all our IG values for later
else:
s_name, value = line.split() # Separate out the S value and its operator
s_dict[s_name] = value.strip() # Add to dictionary to map S to operator
# Now you can go back through your list of IG values and append the appropriate operator
pulled_together = []
for item in ig_list:
s_value = item.split('_')[1]
# The following will look for the operator mapped to the S value. If it is
# not found, it will instead give you 'not found'
corresponding_operator = s_dict.get(s_value, 'Not found')
pulled_together.append([item, corresponding_operator])
print ('List structure')
print (pulled_together)
print ('\n')
print('Printout of each item in list')
for item in pulled_together:
print(item[0] + '\t' + item[1])
nname = name[:-3]
Python's slicing through list is very powerful, but can be tricky to understand correctly.
When you write [:-3], you take everything except the last three items. The thing is, if you have less than three element in your list, it does not return you an error, but an empty list.
I think this is where things does not work, as there are not much elements per line, it returns you an empty list. If you could tell what do you exactly want it to return there, with an example or something, it would help a lot, as i don't really know what you're trying to get with your slicing.
Does this do what you want?
from __future__ import print_function
import sys
# Read and store all the gene info lines, keyed by name
gene_info = dict()
with open(sys.argv[2]) as gene_info_file:
for line in gene_info_file:
tokens = line.split()
name = tokens[0].strip()
gene_info[name] = line
# Read the other file and lookup the names
with open(sys.argv[1]) as origin_file:
for line in origin_file:
if line.startswith("IG_"):
name = line.split("_")[1]
nname = name[:-3].strip()
if nname in gene_info:
lookup_line = gene_info[nname]
print("IG_" + lookup_line)
else:
pass # what do you want to do in this case?
else:
print(line)

Faster operation reading file

I have to process a 15MB txt file (nucleic acid sequence) and find all the different substrings (size 5). For instance:
ABCDEF
would return 2, as we have both ABCDE and BCDEF, but
AAAAAA
would return 1. My code:
control_var = 0
f=open("input.txt","r")
list_of_substrings=[]
while(f.read(5)!=""):
f.seek(control_var)
aux = f.read(5)
if(aux not in list_of_substrings):
list_of_substrings.append(aux)
control_var += 1
f.close()
print len(list_of_substrings)
Would another approach be faster (instead of comparing the strings direct from the file)?
Depending on what your definition of a legal substring is, here is a possible solution:
import re
regex = re.compile(r'(?=(\w{5}))')
with open('input.txt', 'r') as fh:
input = fh.read()
print len(set(re.findall(regex, input)))
Of course, you may replace \w with whatever you see fit to qualify as a legal character in your substring. [A-Za-z0-9], for example will match all alphanumeric characters.
Here is an execution example:
>>> import re
>>> input = "ABCDEF GABCDEF"
>>> set(re.findall(regex, input))
set(['GABCD', 'ABCDE', 'BCDEF'])
EDIT: Following your comment above, that all character in the file are valid, excluding the last one (which is \n), it seems that there is no real need for regular expressions here and the iteration approach is much faster. You can benchmark it yourself with this code (note that I slightly modified the functions to reflect your update regarding the definition of a valid substring):
import timeit
import re
FILE_NAME = r'input.txt'
def re_approach():
return len(set(re.findall(r'(?=(.{5}))', input[:-1])))
def iter_approach():
return len(set([input[i:i+5] for i in xrange(len(input[:-6]))]))
with open(FILE_NAME, 'r') as fh:
input = fh.read()
# verify that the output of both approaches is identicle
assert set(re.findall(r'(?=(.{5}))', input[:-1])) == set([input[i:i+5] for i in xrange(len(input[:-6]))])
print timeit.repeat(stmt = re_approach, number = 500)
print timeit.repeat(stmt = iter_approach, number = 500)
15MB doesn't sound like a lot. Something like this probably would work fine:
import Counter, re
contents = open('input.txt', 'r').read()
counter = Counter.Counter(re.findall('.{5}', contents))
print len(counter)
Update
I think user590028 gave a great solution, but here is another option:
contents = open('input.txt', 'r').read()
print set(contents[start:start+5] for start in range(0, len(contents) - 4))
# Or using a dictionary
# dict([(contents[start:start+5],True) for start in range(0, len(contents) - 4)]).keys()
You could use a dictionary, where each key is a substring. It will take care of duplicates, and you can just count the keys at the end.
So: read through the file once, storing each substring in the dictionary, which will handle finding duplicate substrings & counting the distinct ones.
Reading all at once is more i/o efficient, and using a dict() is going to be faster than testing for existence in a list. Something like:
fives = {}
buf = open('input.txt').read()
for x in xrange(len(buf) - 4):
key = buf[x:x+5]
fives[key] = 1
for keys in fives.keys():
print keys

Replace recursively from a replacement map

I have a dictionary in the form
{'from.x': 'from.changed.x',...}
possibly quite big, and I have to substitute in text files accordingly to that dictionary in a quite big directory structure.
I didn't find anything which might any nice solution and I end up:
using os.walk
iterating through the dictionary
writing everything out
WIth something like:
def fix_imports(top_dir, not_ui_keys):
"""Walk through the directory and substitute the wrong imports
"""
repl = {}
for n in not_ui_keys:
# interleave a model in between
dotted = extract_dotted(n)
if dotted:
repl[dotted] = add_model(dotted)
for root, dirs, files in walk(top_dir):
py_files = [path.join(root, x) for x in files if x.endswith('.py')]
for py in py_files:
res = replace_text(open(py).read(), repl)
def replace_text(orig_text, replace_map):
res = orig_text
# now try to grep all the keys, using a translate maybe
# with a dictionary of the replacements
for to_replace in replace_map:
res.replace(to_replace, replace_map[to_replace])
# now print the differences
for un in unified_diff(res.splitlines(), orig_text.splitlines()):
print(un)
return res
Is there any better/nicer/faster way to do it?
EDIT:
Clarifying a bit the problem, the substitution are generated from a function, and they are all in the form:
{'x.y.z': 'x.y.added.z', 'x.b.a': 'x.b.added.a'}
And yes, sure I should better use regexps, I just thought I didn't need them this time.
I don't think it can help much, however, because I can't really formalize the whole range of substitutions with only one (or multiple) regexps..
I would write the first function using generators:
def fix_imports(top_dir, not_ui_keys):
"""Walk through the directory and substitute the wrong imports """
from itertools import imap,ifilter
gen = ifilter(None,imap(extract_dotted, not_ui_keys))
repl = dict((dotted,add_model(dotted)) for dotted in gen)
py_files = (path.join(root, x)
for root, dirs, files in walk(top_dir)
for x in files if x[-3:]=='.py')
for py in py_files:
with open(py) as opf:
res = replace_text(opf.read(), repl)
x[-3:]=='.py' is faster than x.endswith('.py')
Thank you everyone, and about the problem of substituting from a mapping in many files, I think I have a working solution:
def replace_map_to_text(repl_map, text_lines):
"""Take a dictionary with the replacements needed and a list of
files and return a list with the substituted lines
"""
res = []
concat_st = "(%s)" % "|".join(repl_map.keys())
# '.' in non raw regexp means one of any characters, so must be
# quoted ore we need a way to make the string a raw string
concat_st = concat_st.replace('.', '\.')
combined_regexp = re.compile(concat_st)
for line in text_lines:
found = combined_regexp.search(line)
if found:
expr = found.group(1)
new_line = re.sub(expr, repl_map[expr], line)
logger.info("from line %s to line %s" % (line, new_line))
res.append(new_line)
else:
res.append(line)
return res
def test_replace_string():
lines = ["from psi.io.api import x",
"from psi.z import f"]
expected = ["from psi.io.model.api import x",
"from psi.model.z import f"]
mapping = {'psi.io.api': 'psi.io.model.api',
'psi.z': 'psi.model.z'}
assert replace_map_to_text(mapping, lines) == expected
In short I compose a big regexp in the form
(first|second|third)
Then I search for it in every line and substitute with re.sub if something was found.
Still a bit rough but the simple test after works fine.
EDIT: fixed a nasty bug in the concatenation, because if it's not a raw string '.' means only one character, not a '.'

Categories