Python module ZipFile: get base folder using regex

Assume the zip file "acme_example.zip" contains the following files/folders:
acme/one.txt
acme/one1.txt
acme/one2.txt
acme/one3.txt
acme/one4.txt
__MACOSX
.DS_Store
And I am using the script below:
from pathlib import Path
from zipfile import ZipFile
import re

# (this snippet sits inside a larger function, hence the bare return)
output_var = []
skip_st = '__MACOSX'
with ZipFile('acme_example.zip', 'r') as ZipObj:
    listfFiles = ZipObj.namelist()
    for elm in listfFiles:
        p = Path(elm).parts[0]
        if p not in output_var:
            output_var.append(p)
return re.sub(skip_st, '', ''.join(str(item) for item in output_var))
The script above excludes "__MACOSX", but is there a way to also exclude ".DS_Store" so that only "acme" is returned as the folder name?

Since you are already iterating over the values, it is better to exclude them at that point. Also, as they are already strings, you can simplify the join:
skip_st = ['__MACOSX', '.DS_Store']
with ZipFile('acme_example.zip', 'r') as ZipObj:
    listfFiles = ZipObj.namelist()
    for elm in listfFiles:
        p = Path(elm).parts[0]
        if p not in output_var and p not in skip_st:
            output_var.append(p)
return ''.join(output_var)
For reference, here is how you could filter at the end instead.
With a list:
skip_st = ['__MACOSX', '.DS_Store']
# ...
return ''.join(item for item in output_var if item not in skip_st)
With a pattern:
skip_st = r'__MACOSX|\.DS_Store'
# ...
return re.sub(skip_st, '', ''.join(output_var))
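As a further aside (my own sketch, not part of the original answer), the same result can be had in one pass with a set, which handles both the de-duplication and the exclusions:
from pathlib import Path
from zipfile import ZipFile

skip_st = {'__MACOSX', '.DS_Store'}
with ZipFile('acme_example.zip', 'r') as zip_obj:
    # unique first path component of every archive member
    top_levels = {Path(name).parts[0] for name in zip_obj.namelist()}
base_folder = ''.join(top_levels - skip_st)  # "acme" for the example archive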

Related

How do I hierarchically sort URLs in python?

Given an initial list of URLs crawled from a site:
https://somesite.com/
https://somesite.com/advertise
https://somesite.com/articles
https://somesite.com/articles/read
https://somesite.com/articles/read/1154
https://somesite.com/articles/read/1155
https://somesite.com/articles/read/1156
https://somesite.com/articles/read/1157
https://somesite.com/articles/read/1158
https://somesite.com/blogs
I am trying to turn the list into a tab-organized tree hierarchy:
https://somesite.com
    /advertise
    /articles
        /read
            /1154
            /1155
            /1156
            /1157
            /1158
    /blogs
I've tried using lists, tuples, and dictionaries. So far I have figured out two flawed ways to output the content.
Method 1 will miss elements if they have the same name and position in the hierarchy:
Input:
https://somesite.com
https://somesite.com/missions
https://somesite.com/missions/playit
https://somesite.com/missions/playit/extbasic
https://somesite.com/missions/playit/extbasic/0
https://somesite.com/missions/playit/stego
https://somesite.com/missions/playit/stego/0
Output:
https://somesite.com/
    /missions
        /playit
            /extbasic
                /0
            /stego
----------------^ Missing expected output "/0"
Method 2 will not miss any elements, but it will print redundant content:
Input:
https://somesite.com
https://somesite.com/missions
https://somesite.com/missions/playit
https://somesite.com/missions/playit/extbasic
https://somesite.com/missions/playit/extbasic/0
https://somesite.com/missions/playit/stego
https://somesite.com/missions/playit/stego/0
Output:
https://somesite.com/
    /missions
        /playit
            /extbasic
                /0
    /missions          <- Redundant content
        /playit        <- Redundant content
            /stego
                /0
I'm not sure how to properly do this, and my googling has only turned up references to urllib that don't seem to be what I need. Perhaps there is a much better approach, but I have been unable to find it.
My code for getting the content into a usable list:
#!/usr/bin/python3
import re

# Read the original list of URLs from file
with open("sitelist.raw", "r") as f:
    raw_site_list = f.readlines()

# Extract the prefix and domain from the first line
first_line = raw_site_list[0]
prefix, domain = re.match("(http[s]://)(.*)[/]", first_line).group(1, 2)

# Remove instances of prefix and domain, and trailing newlines, drop any lines that are only a slash
clean_site_list = []
for line in raw_site_list:
    clean_line = line.strip(prefix).strip(domain).strip()
    if not clean_line == "/":
        if not clean_line[len(clean_line) - 1] == "/":
            clean_site_list += [clean_line]

# Split the resulting relative paths into their component parts and filter out empty strings
split_site_list = []
for site in clean_site_list:
    split_site_list += [list(filter(None, site.split("/")))]
This gives a list to manipulate, but I've run out of ideas on how to output it without losing elements or outputting redundant elements.
Thanks
Edit: This is the final working code I put together based on the answer chosen below:
# Read list of URLs from file
with open("sitelist.raw", "r") as f:
    urls = f.readlines()

# Remove trailing newlines
for url in urls:
    urls[urls.index(url)] = url[:-1]

# Remove any trailing slashes
for url in urls:
    if url[-1:] == "/":
        urls[urls.index(url)] = url[:-1]

# Remove duplicate lines
unique_urls = []
for url in urls:
    if url not in unique_urls:
        unique_urls += [url]

# Do the actual work (modified to use unique_urls and use tabs instead of 4x spaces, and to write to file)
base = unique_urls[0]
tabdepth = 0
tlen = len(base.split('/'))
final_urls = []
for url in unique_urls[1:]:
    t = url.split('/')
    lt = len(t)
    if lt != tlen:
        tabdepth += 1 if lt > tlen else -1
        tlen = lt
    pad = ''.join(['\t' for _ in range(tabdepth)])
    final_urls += [f'{pad}/{t[-1]}']

with open("sitelist.new", "wt") as f:
    f.write(base + "\n")
    for url in final_urls:
        f.write(url + "\n")
This works with your sample data:
urls = ['https://somesite.com',
        'https://somesite.com/missions',
        'https://somesite.com/missions/playit',
        'https://somesite.com/missions/playit/extbasic',
        'https://somesite.com/missions/playit/extbasic/0',
        'https://somesite.com/missions/playit/stego',
        'https://somesite.com/missions/playit/stego/0']

base = urls[0]
print(base)

tabdepth = 0
tlen = len(base.split('/'))

for url in urls[1:]:
    t = url.split('/')
    lt = len(t)
    if lt != tlen:
        tabdepth += 1 if lt > tlen else -1
        tlen = lt
    pad = ''.join([' ' for _ in range(tabdepth)])
    print(f'{pad}/{t[-1]}')
This code will help you with your task. I agree it might be a bit long and might contain some redundant code and checks, but it builds a dictionary containing the hierarchy of the URLs; you can use that dictionary however you like, print it or store it.
Moreover, this code will also parse URLs from different domains and create a separate tree for each of them (see code and output).
EDIT: This will also take care of redundant URLs.
Code:
from json import dumps


def process_urls(urls: list):
    tree = {}
    for url in urls:
        url_components = url.split("/")
        # First three components will be the protocol,
        # an empty entry,
        # and the base domain
        base_domain = url_components[:3]
        base_domain = base_domain[0] + "//" + "".join(base_domain[1:])

        # Add base domain to tree if not there.
        try:
            tree[base_domain]
        except:
            tree[base_domain] = {}

        structure = url_components[3:]
        for i in range(len(structure)):
            # add the first element
            if i == 0:
                try:
                    tree[base_domain]["/" + structure[i]]
                except:
                    tree[base_domain]["/" + structure[i]] = {}
            else:
                base = tree[base_domain]["/" + structure[0]]
                for j in range(1, i):
                    base = base["/" + structure[j]]
                try:
                    base["/" + structure[i]]
                except:
                    base["/" + structure[i]] = {}
    return tree


def print_tree(tree: dict, depth=0):
    for key in tree.keys():
        print("\t" * depth + key)
        # redundant checks
        if type(tree[key]) == dict:
            # if the dictionary is empty then do nothing,
            # else call this function recursively
            # with depth increased by 1
            if tree[key]:
                print_tree(tree[key], depth + 1)


if __name__ == "__main__":
    urls = [
        'https://somesite.com',
        'https://somesite.com/missions',
        'https://somesite.com/missions/playit',
        'https://somesite.com/missions/playit/extbasic',
        'https://somesite.com/missions/playit/extbasic/0',
        'https://somesite.com/missions/playit/extbasic/0',
        'https://somesite.com/missions/playit/extbasic/0',
        'https://somesite.com/missions/playit/extbasic/0',
        'https://somesite.com/missions/playit/stego',
        'https://somesite.com/missions/playit/stego/0',
        'https://somesite2.com/missions/playit',
        'https://somesite2.com/missions/playit/extbasic',
        'https://somesite2.com/missions/playit/extbasic/0',
        'https://somesite2.com/missions/playit/stego',
        'https://somesite2.com/missions/playit/stego/0'
    ]
    tree = process_urls(urls)
    print_tree(tree)
Output:
https://somesite.com
    /missions
        /playit
            /extbasic
                /0
            /stego
                /0
https://somesite2.com
    /missions
        /playit
            /extbasic
                /0
            /stego
                /0
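For what it is worth, here is a shorter sketch of the same nested-dictionary idea (a hypothetical alternative, not taken from the answers above): walk each URL's path parts, nest dictionaries with setdefault, and print the tree recursively. Duplicate URLs collapse automatically because dictionary keys are unique.
from urllib.parse import urlsplit

def build_tree(urls):
    tree = {}
    for url in urls:
        parts = urlsplit(url)
        node = tree.setdefault(f"{parts.scheme}://{parts.netloc}", {})
        for part in parts.path.strip("/").split("/"):
            if part:  # skip the empty string produced by a bare path
                node = node.setdefault("/" + part, {})
    return tree

def show(tree, depth=0):
    for key, subtree in tree.items():
        print("\t" * depth + key)
        show(subtree, depth + 1)

show(build_tree(urls))  # with the urls list above, this prints the same two trees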

get wanted data from a text file with python without using splits

Hello, I have this file:
WORKERS = yovel:10.0.0.6,james:10.0.0.7
BLACKLIST = 92.122.197.45:ynet,95.1.2.2:twitter
I'm trying to write a function in Python that takes the worker IP and returns the worker name, like this:
workername = getName(ip)
The only method I could think of uses splits (.split(":"), .split(",") etc.), but that would be long and not very smart code.
Is there a shorter way to do it?
You can use re:
import re

def getName(ip, content=open('filename.txt').read()):
    _r = re.findall(r'\w+(?=:{})'.format(ip), content)
    return _r[0] if _r else None
print(getName('10.0.0.6'))
Output:
'yovel'
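As an aside, the (?=...) part is a lookahead: it matches the word characters only when they are immediately followed by a colon and the given IP, without consuming that text. A tiny illustration (with the IP's dots escaped, which is slightly stricter than the answer above):
import re

line = "WORKERS = yovel:10.0.0.6,james:10.0.0.7"
print(re.findall(r'\w+(?=:10\.0\.0\.6)', line))  # ['yovel']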
Note, however, it is slightly more robust to use split:
def getName(ip):
    lines = dict(i.strip('\n').split(' = ') for i in open('filename.txt'))
    d = {b: a for a, b in map(lambda x: x.split(':'), lines['WORKERS'].split(','))}
    return d.get(ip)
Using split() doesn't look too bad here:
def getName(ip_address, filename='file.txt', line_type='WORKERS'):
    with open(filename) as in_file:
        for line in in_file:
            name, info = [x.strip() for x in line.strip().split('=')]
            if name == line_type:
                info = [x.split(':') for x in info.split(',')]
                lookup = {ip: name for name, ip in info}
                return lookup.get(ip_address)
Which works as follows:
>>> getName('10.0.0.6')
'yovel'
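If the file may grow more sections later, here is a hedged sketch of a slightly more general lookup (my own helper, assuming the exact "WORKERS = name:ip,name:ip" layout shown above):
def getName(ip, filename='file.txt'):
    with open(filename) as fh:
        for line in fh:
            key, _, value = line.partition('=')
            if key.strip() == 'WORKERS':
                # build an ip -> name map from the "name:ip" pairs
                pairs = (item.split(':', 1) for item in value.strip().split(','))
                return {ip_: name for name, ip_ in pairs}.get(ip)
    return None

print(getName('10.0.0.7'))  # prints: james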

Manipulating Python dictionaries to remove empty values

I'm trying to remove a key/value pair if the key contains 'empty' values.
I have tried the following dictionary comprehension and tried doing it in long form, but it doesn't seem to actually do anything and I get no errors.
def get_Otherfiles():
    regs = ["(.*)((U|u)ser(.*))(\s=\s\W\w+\W)", "(.*)((U|u)ser(.*))(\s=\s\w+)", "(.*)((P|p)ass(.*))\s=\s(\W(.*)\W)", "(.*)((P|p)ass(.*))(\s=\s\W\w+\W)"]
    combined = "(" + ")|(".join(regs) + ")"
    cred_results = []
    creds = []
    un_matched = []
    filesfound = []
    d = {}
    for root, dirs, files in os.walk(dir):
        for filename in files:
            if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
                readfile = open(os.path.join(root, filename), "r")
                d.setdefault(filename, [])
                for line in readfile:
                    m = re.match(combined, line)
                    if m:
                        d[filename].append(m.group(0).rstrip())
                    else:
                        pass
    result = d.copy()
    result.update((k, v) for k, v in d.iteritems() if v is not None)
    print result
Current output:
{'debug.txt': [], 'logonscript1.vbs': ['strUser = "guytom"', 'strPassword = "P#ssw0rd1"'], 'logonscript2.bat': ['strUsername = "guytom2"', 'strPass = "SECRETPASSWORD"']}
As you can see I have entries with empty values. I'd like to remove these before printing the data.
In this part of your code:
d.setdefault(filename, [])
for line in readfile:
    m = re.match(combined, line)
    if m:
        d[filename].append(m.group(0).rstrip())
    else:
        pass
You always add filename as a key to the dictionary, even if you don't subsequently add anything to the resulting list. Try
for line in readfile:
    m = re.match(combined, line)
    if m:
        d.setdefault(filename, []).append(m.group(0).rstrip())
which will only initialize d[filename] to an empty list if it is actually necessary to have something on which to call append.
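To see the difference concretely, here is a tiny illustration (hypothetical file names):
d = {}
d.setdefault('no_hits.txt', [])  # key is created even though nothing is ever appended
print(d)   # {'no_hits.txt': []}

d2 = {}
# with the suggested pattern, the key only appears when there is a match to append
d2.setdefault('hits.vbs', []).append('strUser = "guytom"')
print(d2)  # {'hits.vbs': ['strUser = "guytom"']}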
result = dict((k, v) for k, v in d.iteritems() if v)  # keep only non-empty lists
update won't remove entries ... it will only add or change:
a = {"1": 2}
a.update({"2": 7})
print a  # contains both "1" and "2" keys
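A minimal sketch of that rebuild-the-dict approach applied to the data in question (written with Python 3's items(); the code above targets Python 2's iteritems()):
d = {'debug.txt': [], 'logonscript1.vbs': ['strUser = "guytom"', 'strPassword = "P#ssw0rd1"']}
result = {k: v for k, v in d.items() if v}  # keep only keys whose list is non-empty
print(result)  # {'logonscript1.vbs': ['strUser = "guytom"', 'strPassword = "P#ssw0rd1"']}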
Looking at the first matching group in your regex, (.*): if the regex matches but there are no characters to match, the group is "", not None. So you can filter on truthiness there:
result.update((k, v) for k, v in d.iteritems() if v)
But you can also have your regex do that part for you. Change that first group to (.+) and you won't have empty values to filter out.
EDIT
Instead of removing empty values at the end, you can avoid adding them to the dict altogether.
def get_Otherfiles():
    # fixes: use raw strings so that \s works right, and
    # tighten up the filtering; ... (U|u) should probably be [Uu] ...
    regs = [r"(.+)\s*((U|u)ser(.*))(\s=\s\W\w+\W)", r"(.*)((U|u)ser(.*))(\s=\s\w+)", r"(.*)((P|p)ass(.*))\s=\s(\W(.*)\W)", r"(.*)((P|p)ass(.*))(\s=\s\W\w+\W)"]
    combined = "(" + ")|(".join(regs) + ")"
    cred_results = []
    creds = []
    un_matched = []
    filesfound = []
    d = {}
    for root, dirs, files in os.walk(dir):
        for filename in files:
            if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
                readfile = open(os.path.join(root, filename), "r")
                # assuming you want to aggregate matching file names...
                content_list = d.get(filename, [])
                content_orig_len = len(content_list)
                for line in readfile:
                    m = re.match(combined, line)
                    if m:
                        content_list.append(m.group(0))
                if len(content_list) > content_orig_len:
                    d[filename] = content_list

use search to get matching list of files

I am using the following method of a class to find out whether every .csv file has a corresponding .csv.meta in the given directory.
I am getting "None" for files which are just .csv and a match object (printed with a hexadecimal memory address) for .csv.meta files.
Result
None
<_sre.SRE_Match object at 0x1bb4300>
None
<_sre.SRE_Match object at 0xbd6378>
This is the code:
def validate_files(self, filelist):
    try:
        local_meta_file_list = []
        local_csv_file_list = []
        # Validate each file and see if they pair properly based on the pattern *.csv and *.csv.meta
        for tmp_file_str in filelist:
            csv_match = re.search(self.vprefix_pattern + '([0-9]+)' + self.vcsv_file_postfix_pattern + '$', tmp_file_str)
            if csv_match:
                local_csv_file_list.append(csv_match.group())
                meta_file_match_pattern = self.vprefix_pattern + csv_match.group(1) + self.vmeta_file_postfix_pattern
                tmp_meta_file = [os.path.basename(s) for s in filelist if meta_file_match_pattern in s]
                local_meta_file_list.extend(tmp_meta_file)
    except Exception, e:
        print e
        self.m_logger.error("Error: Validate File Process thrown exception " + str(e))
        sys.exit(1)
    return local_csv_file_list, local_meta_file_list
These are the file names:
rp_package.1406728501.csv.meta
rp_package.1406728501.csv
rp_package.1402573701.csv.meta
rp_package.1402573701.csv
rp_package.1428870707.csv
rp_package.1428870707.meta
Thanks
Sandy
If all you need is to find .csv files which have corresponding .csv.meta files, then I don’t think you need to use regular expressions for filtering them. We can filter the file list for those with the .csv extension, then filter that list further for files whose name, plus .meta, appears in the file list.
Here’s a simple example:
myList = [
    'rp_package.1406728501.csv.meta',
    'rp_package.1406728501.csv',
    'rp_package.1402573701.csv.meta',
    'rp_package.1402573701.csv',
    'rp_package.1428870707.csv',
    'rp_package.1428870707.meta',
]

def validate_files(file_list):
    loc_csv_list = filter(lambda x: x[-3:].lower() == 'csv', file_list)
    loc_meta_list = filter(lambda c: '%s.meta' % c in file_list, loc_csv_list)
    return loc_csv_list, loc_meta_list

print validate_files(myList)
If there may be CSV files that don’t conform to the rp_package format, and need to be excluded, then we can initially filter the file list using the regex. Here’s an example (swap out the regex parameters as necessary):
import re

vprefix_pattern = 'rp_package.'
vcsv_file_postfix_pattern = '.csv'
regex_str = vprefix_pattern + '[0-9]+' + vcsv_file_postfix_pattern

def validate_files(file_list):
    csv_list = filter(lambda x: re.search(regex_str, x), file_list)
    loc_csv_list = filter(lambda x: x[-3:].lower() == 'csv', csv_list)
    loc_meta_list = filter(lambda c: '%s.meta' % c in file_list, loc_csv_list)
    return loc_csv_list, loc_meta_list

print validate_files(myList)
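For completeness, a hedged Python 3 sketch of the same pairing check (find_unpaired is a hypothetical helper, not part of the question's class): it reports the .csv files that lack a .csv.meta partner.
def find_unpaired(file_list):
    names = set(file_list)
    return [f for f in names if f.endswith('.csv') and f + '.meta' not in names]

print(find_unpaired(myList))  # ['rp_package.1428870707.csv']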

Python Error: String indices must be integers, not str

OK, I have an obvious problem staring me in the face that I can't figure out. I am getting the output/results I need, but I also get TypeError: "string indices must be integers, not str". The following is a sample of my code. It happens because of the statement "if f not in GetSquishySource(dirIn)". Basically I am looking to see if a specific file is in another list so that I don't end up adding it to a zip file I am creating. I just don't see the problem here or how to get around it. Any help would be appreciated.
def compressLists(z, dirIn, dirsIn, filesIn, encrypt=None):
    try:
        with zipfile.ZipFile(z, 'w', compression=zipfile.ZIP_DEFLATED) as zip:
            # Add files
            compressFileList(z, dirIn, dirIn, filesIn, zip, encrypt)
            # Add directories
            for dir in dirsIn:
                dirPath = os.path.join(dirIn, dir["name"])
                for root, dirs, files in os.walk(dirPath):
                    # Ignore hidden files and directories
                    files = [f for f in files if not f[0] == '.']
                    dirs[:] = [d for d in dirs if not d[0] == '.']
                    # Replace file entries with structure value entries
                    for i, f in enumerate(files):
                        del files[i]
                        if f not in GetSquishySource(dirIn):
                            files.insert(i, {'zDir': dir["zDir"], 'name': f})
                    compressFileList(z, dirIn, root, files, zip, encryptedLua)
                    if dir["recurse"] == False:
                        break
The following is the GetSquishySource function I created and call.
def GetSquishySource(srcDir):
    squishyLines = []
    srcToRemove = []
    if os.path.isfile(srcDir + os.path.sep + "squishy"):
        with open(srcDir + os.path.sep + "squishy") as squishyFile:
            squishyContent = squishyFile.readlines()
            squishyFile.close()
        for line in squishyContent:
            if line.startswith("Module") and line is not None:
                squishyLines.append(line.split(' '))
        for s in squishyLines:
            if len(s) == 3 and s is not None:
                # If the 3rd column in the squishy file contains data, use that.
                path = s[2].replace('Module "', '').replace('"', '').replace("\n", '')
                srcToRemove.append(os.path.basename(path))
            elif len(s) == 2 and s is not None:
                # If the 3rd column in the squishy file contains no data, then use the 2nd column.
                path = s[1].replace('Module "', '').replace('"', '').replace("\n", '').replace(".", os.path.sep) + ".lua"
                srcToRemove.append(os.path.basename(path))
    return srcToRemove
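For reference, the error named in the title can be reproduced in isolation like this (an illustrative sketch, not a diagnosis of the code above). It is raised whenever a plain string is indexed with a string key, which usually means a value you expected to be a dict is actually a str:
entry = "one.txt"     # a plain string where a dict was expected
print(entry[0])       # fine: integer index gives "o"
print(entry["name"])  # TypeError: string indices must be integers, not str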
