I have a list of .ts files inside a folder. I extract the content ID from an XML file; the content ID is the .ts filename without its extension. I need to search for a .ts file that matches the content ID, but for some reason it's failing. I am attaching the code below, and also a screenshot of the .ts files.
import glob
import lxml.etree as et
import os, csv

ASSET_METADATA_PATH = '/Users/roradhak/eVision/failed_assets/'
TS_PATH = '/Users/roradhak/eVision/ts_check/'

def parse_file(path):
    tree = et.parse(path)
    root = tree.getroot()
    trailer_id = ""
    programs = root.xpath('Program[@title="Program"]')
    if len(programs) == 0:
        return None, None, None, None
    program = programs[0]  # TODO - Are multiple programs expected? If so, the function should return a list of tuples
    # Get the Content ID
    c_id = program.xpath('props/*[@title="Content ID"]')
    if len(c_id) == 0:
        content_id = None
    else:
        content_id = c_id[0].text
    # Get the has_trailer attribute
    has_t = program.xpath('props/*[@title="Has_Trailer"]')
    has_trailer = has_t[0].text
    if has_t[0].text == "Y":
        trailer_id = content_id.replace('M', 'T', 1)
    # Get the content name
    n = program.xpath('props/*[@title="Name"]')
    if len(n) == 0:
        content_name = None
    else:
        content_name = n[0].text
    return content_id, content_name, has_trailer, trailer_id
def main():
    asset_metadata = glob.glob(os.path.join(ASSET_METADATA_PATH, u'*.xml'))
    movies = glob.glob(os.path.join(TS_PATH, u'*.ts'))
    for p in asset_metadata:
        print(u'Processing: {p}'.format(p=p).encode('utf-8'))
        content_id, content_name, has_trailer, trailer_id = parse_file(p)
        print content_id, content_name, has_trailer, trailer_id
        if u'{c}.ts'.format(c=content_id) not in TS_PATH:
            print "No Movie"
        if has_trailer == "Y":
            if u'{c}.ts'.format(c=trailer_id) not in movies:
                print "No trailer"

if __name__ == '__main__':
    main()
The output is as below:
/Users/roradhak/IVPGET_Local/venv/bin/python /Users/roradhak/Downloads/validate_xml.py
Processing: /Users/roradhak/eVision/failed_assets/E30000001557115265_2019_08_29T11_20_08Z.xml
MD009232 Ep 143 - Cool look Hair style N
No Movie
Processing: /Users/roradhak/eVision/failed_assets/10000000717960000_2019_10_09T15_04_20Z.xml
MZ008931 Aan: Men At Work Y TZ008931
No Movie
No trailer
Processing: /Users/roradhak/eVision/failed_assets/E30000001557537308_2019_08_09T19_15_22Z.xml
MZ010564 EP29 - Episode 29 - Raheem S1 Y TZ010564
No Movie
No trailer
Process finished with exit code 0
Here is how I would do it with pathlib and Python 3.6+ (for the f-strings):
from pathlib import Path

failed_assets_folder = Path('/Users/roradhak/eVision/failed_assets')
ts_folder = Path('/Users/roradhak/eVision/ts_check')

def main():
    for failed_asset in failed_assets_folder.glob('*.xml'):
        print(f'Processing: {failed_asset.name}')
        content_id, content_name, has_trailer, trailer_id = parse_file(failed_asset)
        print(f'{content_id}, {content_name}, {has_trailer}, {trailer_id}')
        if not (ts_folder / f'{content_id}.ts').exists():
            print('No Movie')
        if has_trailer == 'Y':
            if not (ts_folder / f'{trailer_id}.ts').exists():
                print('No trailer')
It only implements the file-search portion, and it is untested.
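If you would rather compare bare file names instead of testing each path, you can also collect the stems (names without extension) of the .ts files once and check membership against that set. A small untested sketch along the same lines (the helper name missing_assets is mine, not from the original code):
from pathlib import Path

ts_folder = Path('/Users/roradhak/eVision/ts_check')

def missing_assets(content_id, trailer_id=None):
    # Collect the file stems once so each lookup is O(1).
    ts_stems = {p.stem for p in ts_folder.glob('*.ts')}
    if content_id not in ts_stems:
        print('No Movie')
    if trailer_id and trailer_id not in ts_stems:
        print('No trailer')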
I would like to create different Word documents using a template and an Excel input file.
So I read the .xlsx file, change the content of the template, and want to save several .docx files. I use the following code, which works fine for the first document (everything gets replaced and stored as expected). But the following documents always contain the same content as the first one. For every row in the Excel sheet I tried to reassign the document with
docWork = doc
But it seems that somehow this initialization is not working.
This is the full code I am using:
from docx import Document
import os, sys
import xlwings as xw
import time
from datetime import datetime

if __name__ == '__main__':
    print(f"Start Program V8...")
    SAVE_INTERVAL = 5
    WAIT = 3
    FN = "dataCreateCover.xlsx"
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    fn = os.path.join(path, FN)
    print(f"Read {fn}...")
    wb = xw.Book(fn)
    ws = wb.sheets[0]
    inpData = ws.range("A2:Z5000").value
    inpData = [x for x in inpData if x[0] is not None]
    tday = str(datetime.today().date())
    print(f"Read DOCX-Files...")
    FNDOC = "template.docx"
    fnDoc = os.path.join(path, FNDOC)
    print(f"Path for DOCX: {fnDoc}...")
    doc = Document(fnDoc)
    for elem in inpData:
        dictWords = {}
        docWork = doc
        elem = [x for x in elem if x is not None]
        for idxElem, valElem in enumerate(elem):
            dictWords[f"[section{idxElem + 1}]"] = valElem
        for idx, para in enumerate(docWork.paragraphs):
            for k, v in dictWords.items():
                if k in para.text:
                    inline = para.runs
                    for item in inline:
                        if k in item.text:
                            item.text = item.text.replace(k, v)
                            print(f"Replaced {k} with {v}...")
                            break
        docFN = f"{tday}_{elem[1]}.docx"
        docWork.save(docFN)
    print(f"Document <{docFN}> created - pls press <enter> to close the window...")
How can I use the docx-template and write different word-docs as output?
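One likely culprit, for what it's worth: docWork = doc only binds a second name to the same Document object instead of copying it, so every row keeps editing (and saving) the already-replaced first document. A minimal, untested sketch of the usual workaround is to re-open the template inside the loop:
from docx import Document

for elem in inpData:
    # Re-open the template so this row starts from an unmodified copy,
    # instead of aliasing the already-edited `doc` object.
    docWork = Document(fnDoc)
    # ... same replacement and save logic as above ...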
Does anyone know how to read the event log files in C:\Windows\System32\winevt\Logs with the .evtx extension?
I have already tried to open one with Notepad and to read it with Python, but Notepad says access is denied...
Does anyone know how to do it? Thanks in advance.
This is how you would read the "Forwarded Events" log file from the Event Viewer. You need admin access, so I would run it as admin, but it will prompt you for elevation if you don't.
import win32evtlog
import xml.etree.ElementTree as ET
import ctypes
import sys

def is_admin():
    try:
        return ctypes.windll.shell32.IsUserAnAdmin()
    except:
        return False

if is_admin():
    # open event file
    query_handle = win32evtlog.EvtQuery(
        r'C:\Windows\System32\winevt\Logs\ForwardedEvents.evtx',
        win32evtlog.EvtQueryFilePath)

    read_count = 0
    while True:
        # read 1 record(s)
        events = win32evtlog.EvtNext(query_handle, 1)
        read_count += len(events)
        # if there is no record break the loop
        if len(events) == 0:
            break
        for event in events:
            xml_content = win32evtlog.EvtRender(event, win32evtlog.EvtRenderEventXml)
            # parse xml content
            xml = ET.fromstring(xml_content)
            # xml namespace, root element has a xmlns definition, so we have to use the namespace
            ns = '{http://schemas.microsoft.com/win/2004/08/events/event}'
            substatus = xml[1][9].text
            event_id = xml.find(f'.//{ns}EventID').text
            computer = xml.find(f'.//{ns}Computer').text
            channel = xml.find(f'.//{ns}Channel').text
            execution = xml.find(f'.//{ns}Execution')
            process_id = execution.get('ProcessID')
            thread_id = execution.get('ThreadID')
            time_created = xml.find(f'.//{ns}TimeCreated').get('SystemTime')
            #data_name = xml.findall('.//EventData')
            #substatus = data_name.get('Data')
            #print(substatus)
            event_data = f'Time: {time_created}, Computer: {computer}, Substatus: {substatus}, Event Id: {event_id}, Channel: {channel}, Process Id: {process_id}, Thread Id: {thread_id}'
            print(event_data)
            user_data = xml.find(f'.//{ns}UserData')
            # user_data may contain arbitrary data
else:
    # relaunch the script with admin rights
    ctypes.windll.shell32.ShellExecuteW(None, "runas", sys.executable, " ".join(sys.argv), None, 1)

input()
.evtx is the extension for Windows event log files. It contains data in a special binary format designed by Microsoft, so you cannot simply open it in a text editor.
There are open source tools to read .evtx files, and NXLog EE can also read them. (Disclaimer: I'm affiliated with the latter.)
I modified the accepted answer a bit as follows, so it becomes reusable:
import xml.etree.ElementTree as Et
import win32evtlog
from collections import namedtuple

WindowsEvent = namedtuple('WindowsEvent', 'event_id, level, time_created, computer')

class EventLogParser:
    def __init__(self, exported_log_file):
        self.exported_log_file = exported_log_file

    def get_all_events(self):
        windows_events = []
        query_handle = win32evtlog.EvtQuery(str(self.exported_log_file),
                                            win32evtlog.EvtQueryFilePath | win32evtlog.EvtQueryReverseDirection)
        while True:
            raw_event_collection = win32evtlog.EvtNext(query_handle, 1)
            if len(raw_event_collection) == 0:
                break
            for raw_event in raw_event_collection:
                windows_events.append(self.parse_raw_event(raw_event))
        return windows_events

    def parse_raw_event(self, raw_event):
        xml_content = win32evtlog.EvtRender(raw_event, win32evtlog.EvtRenderEventXml)
        root = Et.fromstring(xml_content)
        # extract the default namespace from the root tag, e.g. {http://schemas.microsoft.com/...}
        ns = "{" + root.tag.split('}')[0].strip('{') + "}"
        system = root.find(f'{ns}System')
        event_id = system.find(f'{ns}EventID').text
        level = system.find(f'{ns}Level').text
        time_created = system.find(f'{ns}TimeCreated').get('SystemTime')
        computer = system.find(f'{ns}Computer').text
        return WindowsEvent(event_id, level, time_created, computer)
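A quick usage sketch (the file path is only an example):
parser = EventLogParser(r'C:\Windows\System32\winevt\Logs\ForwardedEvents.evtx')
for event in parser.get_all_events():
    # Each entry is a WindowsEvent namedtuple.
    print(event.time_created, event.computer, event.event_id, event.level)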
I use the "python-evtx" library; you can install it with this command:
pip install python-evtx
In my case, I'm not interested in reading records with the "Information" level.
import os
import codecs
from datetime import datetime
from lxml import etree
import Evtx.Evtx as evtx

# Note: parseXMLtoString, createFolder and tempFilePath are my own helpers/globals,
# defined elsewhere (see the edit below for tempFilePath).

def evtxFile(absolutePath, filenameWithExt, ext, _fromDate, _toDate):
    print("Reading: " + filenameWithExt)
    outText = ""
    channel = ""
    # read the windows event viewer log and convert its contents to XML
    with codecs.open(tempFilePath, "a+", "utf-8", "ignore") as tempFile:
        with evtx.Evtx(absolutePath) as log:
            for record in log.records():
                xmlLine = record.xml()
                xmlLine = xmlLine.replace(" xmlns=\"http://schemas.microsoft.com/win/2004/08/events/event\"", "")
                xmlParse = etree.XML(xmlLine)
                level = parseXMLtoString(xmlParse, ".//Level/text()")
                if not level == "0" and not level == "4":
                    providerName = parseXMLtoString(xmlParse, ".//Provider/@Name")
                    qualifiers = parseXMLtoString(xmlParse, ".//EventID/@Qualifiers")
                    timestamp = parseXMLtoString(xmlParse, ".//TimeCreated/@SystemTime")
                    eventID = parseXMLtoString(xmlParse, ".//EventID/text()")
                    task = parseXMLtoString(xmlParse, ".//Task/text()")
                    keywords = parseXMLtoString(xmlParse, ".//Keywords/text()")
                    eventRecordID = parseXMLtoString(xmlParse, ".//EventRecordID/text()")
                    channel = parseXMLtoString(xmlParse, ".//Channel/text()")
                    computer = parseXMLtoString(xmlParse, ".//Computer/text()")
                    message = parseXMLtoString(xmlParse, ".//Data/text()")
                    if level == "1":
                        level = "Critical"
                    elif level == "2":
                        level = "Error"
                    elif level == "3":
                        level = "Warning"
                    date = timestamp[0:10]
                    time = timestamp[11:19]
                    time = time.replace(".", "")
                    _date = datetime.strptime(date, "%Y-%m-%d").date()
                    if _fromDate <= _date <= _toDate:
                        message = message.replace("<string>", "")
                        message = message.replace("</string>", "")
                        message = message.replace("\r\n", " ")
                        message = message.replace("\n\r", " ")
                        message = message.replace("\n", " ")
                        message = message.replace("\r", " ")
                        outText = date + " " + time + "|" + level + "|" + message.strip() + "|" + task + "|" + computer + "|" + providerName + "|" + qualifiers + "|" + eventID + "|" + eventRecordID + "|" + keywords + "\n"
                        tempFile.writelines(outText)
    with codecs.open(tempFilePath, "r", "utf-8", "ignore") as tempFile2:
        myLinesFromDateRange = tempFile2.readlines()
    # delete the temporary file that was created
    os.remove(tempFilePath)
    if len(myLinesFromDateRange) > 0:
        createFolder("\\filtered_data_files\\")
        outFilename = "windows_" + channel.lower() + "_event_viewer_logs" + ext
        myLinesFromDateRange.sort()
        # remove duplicate records from the list
        myFinalLinesFromDateRange = list(set(myLinesFromDateRange))
        myFinalLinesFromDateRange.sort()
        with codecs.open(os.getcwd() + "\\filtered_data_files\\" + outFilename, "a+", "utf-8", "ignore") as linesFromDateRange:
            linesFromDateRange.seek(0)
            if len(linesFromDateRange.read(100)) > 0:
                linesFromDateRange.writelines("\n")
            linesFromDateRange.writelines(myFinalLinesFromDateRange)
        del myLinesFromDateRange[:]
        del myFinalLinesFromDateRange[:]
    else:
        print("No data was found within the specified date range.")
    print("Closing: " + filenameWithExt)
I hope it helps you or someone else in the future.
EDIT:
The "tempFilePath" can be anything you want, for example:
tempFilePath = os.getcwd() + "\\tempFile.txt"
I collected some information first before calling the "evtxFile" function:
The "From" and the "To" dates are in the following format: YYYY-MM-DD
Converted the dates to "date" data type:
_fromDate = datetime.strptime(fromDate, "%Y-%m-%d").date()
_toDate = datetime.strptime(toDate, "%Y-%m-%d").date()
Walked the directory where the .evtx files are located and split each file path into its parts:
def splitDirectory(root, file):
    absolutePathOfFile = os.path.join(root, file)
    filePathWithoutFilename = os.path.split(absolutePathOfFile)[0]
    filenameWithExt = os.path.split(absolutePathOfFile)[1]
    filenameWithoutExt = os.path.splitext(filenameWithExt)[0]
    extension = os.path.splitext(filenameWithExt)[1]
    return absolutePathOfFile, filePathWithoutFilename, filenameWithExt, filenameWithoutExt, extension

for root, subFolders, files in os.walk(directoryPath):
    for f in files:
        absolutePathOfFile, filePathWithoutFilename, filenameWithExt, \
            filenameWithoutExt, extension = splitDirectory(root, f)
        if extension == ".evtx":
            evtxFile(absolutePathOfFile, filenameWithExt, ".txt", _fromDate, _toDate)
I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table, but the resulting CSV file had missing information.
Full code:
import os
import csv
import time
import urllib2

uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'

def get_data():
    try:
        req = urllib2.Request(uri)
        response = urllib2.urlopen(req, timeout=600)
        content = response.read()
        return content
    except Exception, e:
        print "\n[!] Error: " + str(e)
        print ''
        return False

def extract(data, rk):
    print '\n[+] Extracting data.'
    start = 0
    while True:
        store = [rk]
        if data.find('nba/player/', start) == -1:
            break
        with open("data.csv", "ab") as fcsv:
            main = data.find('nba/player/', start)
            name_start = data.find('>', main) + 1
            name_end = data.find('<', name_start)
            store.append(data[name_start:name_end])
            team_start = data.find('">', name_end) + 2
            team_end = data.find('<', team_start)
            store.append(data[team_start:team_end])
            gp_start = data.find(' >', team_end) + 2
            gp_end = data.find('<', gp_start)
            store.append(data[gp_start:gp_end])
            mpg_start = data.find(' >', gp_end) + 2
            mpg_end = data.find('<', mpg_start)
            store.append(data[mpg_start:mpg_end])
            pts_start = data.find('">', mpg_end) + 2
            pts_end = data.find('<', pts_start)
            store.append(data[pts_start:pts_end])
            mf_start = data.find(' >', pts_end) + 2
            mf_end = data.find('<', mf_start)
            store.append(data[mf_start:mf_end])
            fg_start = data.find(' >', mf_end) + 2
            fg_end = data.find('<', fg_start)
            store.append(data[fg_start:fg_end])
            m3_start = data.find(' >', fg_end) + 2
            m3_end = data.find('<', m3_start)
            store.append(data[m3_start:m3_end])
            p3_start = data.find(' >', m3_end) + 2
            p3_end = data.find('<', p3_start)
            store.append(data[p3_start:p3_end])
            ft_start = data.find(' >', p3_end) + 2
            ft_end = data.find('<', ft_start)
            store.append(data[ft_start:ft_end])
            ftp_start = data.find(' >', ft_end) + 2
            ftp_end = data.find('<', ftp_start)
            store.append(data[ftp_start:ftp_end])
            start = name_end
            rk = rk + 1
            csv.writer(fcsv).writerow(store)
        fcsv.close()

def main():
    print "\n[+] Initializing..."
    if not os.path.exists("data.csv"):
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(["RK", "PLAYER", "TEAM", "GP", "MPG", "PTS", "FGM-FGA", "FG%", "3PM-3PA", "3P%", "FTM-FTA", "FT%"])
        fcsv.close()
    rk = 1
    global uri
    while True:
        time.sleep(1)
        start = 0
        print "\n[+] Getting data, please wait."
        data = get_data()
        if not data:
            break
        extract(data, rk)
        print "\n[+] Preparing for next page."
        time.sleep(1.5)
        rk = rk + 40
        if rk > 300:
            print "\n[+] All Done !\n"
            break
        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)

if __name__ == '__main__':
    main()
I specifically want to know how to grab info based on the headlines. Like TEAM GP MPG PTS FGM-FGA FG% 3PM-3PA 3P% FTM-FTA FT%
So the script shouldn't need to be changed beyond names like pts or mpg in pts_start = data.find('">', mpg_end) + 2.
I don't understand why I can't just use the headline name as shown in the table for certain columns. For example, instead of FTM-FTA, the script uses ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example gives you the idea but is not a complete solution to your problem; however, you can easily extend it.
from bs4 import BeautifulSoup
import urllib2

def get_html_page_dom(url):
    response = urllib2.urlopen(url)
    html_doc = response.read()
    return BeautifulSoup(html_doc, 'html5lib')

def extract_rows(dom):
    table_rows = dom.select('.mod-content tbody tr')
    for tr in table_rows:
        # skip headers
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue
        tds = tr.select('td')
        yield {'RK': tds[0].string,
               'PLAYER': tds[1].select('a')[0].string,
               'TEAM': tds[2].string,
               'GP': tds[3].string
               # you can fetch the rest of the indexes for the corresponding headers
               }

if __name__ == '__main__':
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)
You can simply run it and see the result ;).
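If you want to pick cells up by headline rather than hard-coding the keys, one option is to read the column names from the colhead row and zip them with each data row. A sketch building on the answer above (untested against the live page):
def extract_rows_by_header(dom):
    headers = None
    for tr in dom.select('.mod-content tbody tr'):
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            # Remember the most recent header row, e.g. RK, PLAYER, TEAM, GP, ...
            headers = [td.get_text() for td in tr.select('td')]
            continue
        if headers is None:
            continue  # no header row seen yet
        cells = [td.get_text() for td in tr.select('td')]
        yield dict(zip(headers, cells))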
The task:
The firm I have a summer job with has an expanding test database consisting of a growing number of subfolders for each project, containing everything from .jpeg files to the .xlsx files I am interested in. As I am somewhat used to Python from earlier, I decided to give this task a go. I want to search for Excel documents that have "test spreadsheet" as part of their title (for example "test spreadsheet model259"). All the docs I am interested in are built the same way (weight is always in "A3", etc.), looking somewhat like this:
Model: 259
Length: meters 27
Weight: kg 2500
Speed: m/s 25
I want the user of the finished program to be able to compare results from different tests with each other using my script. This means that the script must see if there is an x-value that fits both criteria at once:
inputlength = x*length of model 259
inputweight = x*weight of model 259
The program should loop through all the files in the main folder. If such an x exists for a model, I want the program to add that model to a list of fitting models. The x-value will be a variable, different for each model (see the sketch after the example below).
As the result I want a list of all files that fit the input, their scale (x-value), and possibly a link to each file.
For example:
Model scale Link
ModelA 21.1 link_to_fileA
ModelB 0.78 link_to_fileB
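In other words, the scale implied by the length has to agree with the scale implied by the weight, within some tolerance. A sketch of that check with hypothetical names (not the poster's code):
def fitting_scale(inputlength, inputweight, model_length, model_weight, tolerance=0.01):
    # The scale x implied by each dimension; the model fits when both agree.
    x_len = inputlength / float(model_length)
    x_wgt = inputweight / float(model_weight)
    if abs(x_len - x_wgt) <= tolerance:
        return (x_len + x_wgt) / 2  # a single representative scale
    return None  # no common x exists for this model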
The script
The script I have tried to get to work so far is below, but if you have other suggestions of how to deal with the task I'll happily accept them. Don't be afraid to ask if I have not explained the task well enough. XLRD is already installed, and I use Eclipse as my IDE. I've been trying to get it to work in many ways now, so most of my script is purely for testing.
Edited:
#-*- coding: utf-8 -*-
# Accepts Norwegian letters
import xlrd, os, fnmatch

start_dir = r'C:\eclipse\TST-folder'  # renamed from `folder` so it matches the name used in os.walk below

def excelfiles(pattern):
    file_list = []
    for root, dirs, files in os.walk(start_dir):
        for filename in files:
            if fnmatch.fnmatch(filename.lower(), pattern):
                if filename.endswith(".xls") or filename.endswith(".xlsx") or filename.endswith(".xlsm"):
                    file_list.append(os.path.join(root, filename))
    return file_list

file_list = excelfiles('*tst*')  # only accept docs whose title includes tst
print file_list
How come I only get one result when I print excelfiles() after returning the values, but when I exchange "return os.path.join(filename)" with "print os.path.join(filename)" it shows all .xls files? Does this mean that the results from the excelfiles function are not passed on? (Answered in the comments.)
''' Inputvals '''
inputweight = int(raw_input('legg inn vekt')) #inputbox for weight
inputlength = int(raw_input('legg inn lengd')) #inputbox for lenght
inputspeed = int(raw_input('legg inn hastighet')) #inputbox for speed
'''Location of each val from the excel spreadsheet'''
def locate_vals():
    val_dict = {}
    for filename in file_list:
        wb = xlrd.open_workbook(os.path.join(start_dir, filename))
        sheet = wb.sheet_by_index(0)
        weightvalue = sheet.cell_value(1, 1)
        lenghtvalue = sheet.cell_value(1, 1)
        speedvalue = sheet.cell_value(1, 1)
        val_dict[filename] = [weightvalue, lenghtvalue, speedvalue]
    return val_dict

val_dict = locate_vals()
print locate_vals()
count = 0
Any ideas on how I can read from each of the documents found by the excelfiles function? "funcdox" does not seem to work. When I insert a print test, for example print weightvalue after the weightvalue = sheet.cell(3, 3).value line, I get no feedback at all. (Edited to the script above, which creates a list of the different values, plus minor changes that removed the error messages.)
The script works well up to this point.
Made some minor changes to the next part. It is supposed to scale a value from the spreadsheet by multiplying it with a constant (x1). Then I want the user to be able to define another input value, which in turn defines another constant (x2) to make the spreadsheet value fit. Eventually, these constants will be compared to find which models will actually fit the test.
'''Calculates vals from excel from the given dimensions'''
def dimension():  # Maybe exchange exec-statement with the function itself.
    if count == 0:
        if inputweight != 0:
            exec scale_weight()
        elif inputlenght != 0:
            exec scale_lenght()
        elif inputspeed != 0:
            exec scale_speed()

def scale_weight(x1, x2):  # Repeat for each value.
    for weightvalue in locate_vals():
        if count == 0:
            x1 * weightvalue == inputweight
            count += 1
            exec criteria2
            return weightvalue, x1
        elif count == 2:
            inputweight2 = int(raw_input('Insert weight'))  # inputbox for weight
            x2 * weightvalue == inputweight2
            return weightvalue, x2
The x1 and x2 are what I want to find with this function, so I want them to be totally "free". Is there any way I can test this function without having to insert values for x1 and x2?
def scale_lenght():  # Almost identical to scale_weight
    return

def scale_speed():  # Almost identical to scale_weight
    return

def criteria2(weight, lenght, speed):
    if count == 1:
        k2 = raw_input('Criteria two, write weight, length or speed.')
        if k2 == weight:
            count += 1
            exec scale_weight
        elif k2 == lenght:
            count += 1
            exec scale_lenght
        elif k2 == speed:
            count += 1
            exec scale_speed
        else:
            return
Do you see any easier way to deal with this problem? (I hope I managed to explain it well enough. The way I have written the code so far is quite messy, but since I'm not that experienced I'll just have to make it work first, and then clean it up if I have the time.)
Since probably none of the values will exactly fit for both x-constants, I thought I'd use approx_Equal to deal with it:
def approx_Equal(x1, x2, tolerance=int(raw_input('Insert tolerance for scaling difference')), err_msg='Unacceptable tolerance', verbose=True):
    # Gives the approximation for how close the two values of x must be
    if x1 == x2:
        x = x1 + (x2 - x1) / 2
        return x
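For comparison, a conventional tolerance check usually tests the difference between the two values rather than exact equality (a sketch, not the poster's code):
def approx_equal(x1, x2, tolerance=0.05):
    # True when x1 and x2 differ by no more than `tolerance`.
    return abs(x1 - x2) <= tolerance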
Eventually, I'd like a diagram of all the variables used + a link-to-file and name for each document.
Not sure how I will do this, so any tips are greatly appreciated.
Thanks!
In answer to the first question, "How come I only get one result when I am printing excelfiles()": this is because your return statement is within the nested loop, so the function stops on the first iteration. I would build up a list instead and then return this list; you could also combine this with the issue of checking the name, e.g.:
import os, fnmatch

# globals
start_dir = os.getenv('md')

def excelfiles(pattern):
    file_list = []
    for root, dirs, files in os.walk(start_dir):
        for filename in files:
            if fnmatch.fnmatch(filename.lower(), pattern):
                if filename.endswith(".xls") or filename.endswith(".xlsx") or filename.endswith(".xlsm"):
                    file_list.append(os.path.join(root, filename))
    return file_list

file_list = excelfiles('*cd*')

for i in file_list: print i
Obviously, you'll need to replace the cd with your own search text, but keep the * on either side, and replace the start_dir with your own. I have done the match on filename.lower() and entered the search text in lower case to make the matching case-insensitive; just remove the .lower() if you don't want this. I have also allowed for other types of Excel files.
Regarding reading data from Excel files, I have done this before to create an automated way of converting basic Excel files into csv format. You are welcome to have a look at the code below and see if there is anything you can use from it. The xl_to_csv function is where the data is read from the Excel file:
import os, csv, sys, traceback, Tkinter, tkFileDialog as fd, xlrd

# stop the Tkinter shell from opening, as it is only needed for the file dialog
root = Tkinter.Tk()
root.withdraw()

def format_date(dt):
    yyyy, mm, dd = str(dt[0]), str(dt[1]), str(dt[2])
    hh, mi, ss = str(dt[3]), str(dt[4]), str(dt[5])
    if len(mm) == 1:
        mm = '0'+mm
    if len(dd) == 1:
        dd = '0'+dd
    if hh == '0' and mi == '0' and ss == '0':
        datetime_str = dd+'/'+mm+'/'+yyyy
    else:
        if len(hh) == 1:
            hh = '0'+hh
        if len(mi) == 1:
            mi = '0'+mi
        if len(ss) == 1:
            ss = '0'+ss
        datetime_str = dd+'/'+mm+'/'+yyyy+' '+hh+':'+mi+':'+ss
    return datetime_str

def xl_to_csv(in_path, out_path):
    # set up vars to read file
    wb = xlrd.open_workbook(in_path)
    sh1 = wb.sheet_by_index(0)
    row_cnt, col_cnt = sh1.nrows, sh1.ncols
    # set up vars to write file
    fileout = open(out_path, 'wb')
    writer = csv.writer(fileout)
    # iterate through rows and cols
    for r in range(row_cnt):
        # make list from row data
        row = []
        for c in range(col_cnt):
            #print "...debug - sh1.cell(",r,c,").value set to:", sh1.cell(r,c).value
            #print "...debug - sh1.cell(",r,c,").ctype set to:", sh1.cell(r,c).ctype
            # check data type and make conversions
            val = sh1.cell(r,c).value
            if sh1.cell(r,c).ctype == 2:  # number data type
                if val == int(val):
                    val = int(val)  # convert to int if no decimal other than .0
                #print "...debug - res 1 (float to str), val set to:", val
            elif sh1.cell(r,c).ctype == 3:  # date fields
                dt = xlrd.xldate_as_tuple(val, 0)  # date no from excel to date obj
                val = format_date(dt)
                #print "...debug - res 2 (date to str), val set to:", val
            elif sh1.cell(r,c).ctype == 4:  # boolean data types
                val = str(bool(val))  # convert 1 or 0 to bool true / false, then string
                #print "...debug - res 3 (bool to str), val set to:", val
            else:
                val = str(val)
                #print "...debug - else, val set to:", val
            row.append(val)
            #print ""
        # write row to csv file
        try:
            writer.writerow(row)
        except:
            print '...row failed in write to file:', row
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
            for line in lines:
                print '!!', line
    print 'Data written to:', out_path, '\n'

def main():
    in_path, out_path = None, None
    # set current working directory to user's my documents folder
    os.chdir(os.path.join(os.getenv('userprofile'),'documents'))
    # ask user for path to Excel file...
    while not in_path:
        print "Please select the excel file to read data from ..."
        try:
            in_path = fd.askopenfilename()
        except:
            print 'Error selecting file, please try again.\n'
    # get dir for output...
    same = raw_input("Do you want to write the output to the same directory? (Y/N): ")
    if same.upper() == 'Y':
        out_path = os.path.dirname(in_path)
    else:
        while not out_path:
            print "Please select a directory to write the csv file to ..."
            try:
                out_path = fd.askdirectory()
            except:
                print 'Error selecting file, please try again.\n'
    # get file name and join to dir
    f_name = os.path.basename(in_path)
    f_name = f_name[:f_name.find('.')]+'.csv'
    out_path = os.path.join(out_path,f_name)
    # get data from file and write to csv...
    print 'Attempting read data from', in_path
    print ' and write csv data to', out_path, '...\n'
    xl_to_csv(in_path, out_path)
    v_open = raw_input("Open file (Y/N):").upper()
    if v_open == 'Y':
        os.startfile(out_path)
    sys.exit()

if __name__ == '__main__':
    main()
Let me know if you have any questions on this.
Finally, regarding the output I would consider writing this out to a html file in a table format. Let me know if you want any help with this, I will have some more sample code that you could use part of.
UPDATE
Here is some further advice on writing your output to an html file, using a function that I have written and used previously for this purpose. Let me know if you need any guidance on what you would need to change for your implementation (if anything). The function expects a nested object in the data argument, e.g. a list of lists or a list of tuples, but should work for any number of rows/columns:
def write_html_file(path, data, heads):
    html = []
    tab_attr = ' border="1" cellpadding="3" style="background-color:#FAFCFF; text-align:right"'
    head_attr = ' style="background-color:#C0CFE2"'
    # opening lines needed for html table
    try:
        html.append('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" ')
        html.append('"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> ')
        html.append('<html xmlns="http://www.w3.org/1999/xhtml">')
        html.append('<body>')
        html.append('  <table'+tab_attr+'>')
    except:
        print 'Error setting up html heading data'
    # html table headings (if required)
    if headings_on:
        try:
            html.append('    <tr'+head_attr+'>')
            for item in heads:
                html.append(' '*6+'<th>'+str(item)+'</th>')
            html.append('    </tr>')
        except:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
            print 'Error writing html table headings:'
            print ''.join('!! ' + line for line in lines)
    # html table content
    try:
        for row in data:
            html.append('    <tr>')
            for item in row:
                html.append(' '*6+'<td>'+str(item)+'</td>')
            html.append('    </tr>')
    except:
        print 'Error writing body of html data'
    # closing lines needed
    try:
        html.append('  </table>')
        html.append('</body>')
        html.append('</html>')
    except:
        print 'Error closing html data'
    # write html data to file
    fileout = open(path, 'w')
    for line in html:
        fileout.write(line)
    print 'Data written to:', path, '\n'
    if sql_path:
        os.startfile(path)
    else:
        v_open = raw_input("Open file (Y/N):").upper()
        if v_open == 'Y':
            os.startfile(path)
headings_on is a global that I have set to True in my script (sql_path is another global flag from my script); you will also need to import traceback for the error handling to work as currently specified.
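A quick usage sketch with those globals set (the table values are taken from the example output above):
import os, sys, traceback

headings_on = True
sql_path = False  # when False, the function asks before opening the file

heads = ['Model', 'scale', 'Link']
data = [('ModelA', 21.1, 'link_to_fileA'),
        ('ModelB', 0.78, 'link_to_fileB')]
write_html_file('models.html', data, heads)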
I want to use the following code to replace strings like "/xxxxx/" with "/xxxxx.html" in page_data, but it doesn't work. page_data is of bytes type, downloaded by a crawler.
page_data.replace(each, neweach)
Only when I change it to:
page_data = page_data.replace(each, neweach)
do the strings (each) in page_data actually get replaced.
The whole code is below:
import os
import sys
import re
import urllib
import urllib2

class WebGet(object):
    base_url = ""
    urls_list = []
    history_list = []
    replace_ch = {}

    def __init__(self, base_url):
        self.base_url = base_url[:-1]
        self.urls_list.append('/')
        self.replace_ch[">>"] = "%3E%3E"
        self.replace_ch["<<"] = "%3C%3C"
        self.replace_ch["::"] = "%3A%3A"

    def recurseGet(self):
        '''Get page data recursively'''
        while(len(self.urls_list) != 0):
            url_suffix = self.urls_list[0]
            self.urls_list.remove(url_suffix)
            self.history_list.append(url_suffix)
            url_to_get = self.base_url + url_suffix
            "Get page data with url"
            print "To get", url_to_get
            page_data = urllib2.urlopen(url_to_get).read()
            page_data_done = self.pageHandle(page_data)
            "Write the page data into file"
            if url_suffix[-1] == '/':
                url_suffix = url_suffix[:-1]
            if url_suffix == '':
                url_suffix = "index"
            elif url_suffix[0] == '/':
                url_suffix = url_suffix[1:]
            url_suffix.replace('/', '\\')
            url_suffix.replace('>>', '%3E%3E')
            url_suffix.replace('<<', '%3C%3C')
            url_suffix.replace('::', '%3A%3A')
            file_str = "e:\\reference\\" + url_suffix
            if file_str.rfind("\\") != 12:
                new_dir = file_str[:file_str.rfind("\\")]
                if os.path.isdir(file_str) == False:
                    os.mkdir(file_str)
            file_str = file_str.strip() + ".html"
            print "write file", file_str
            f_page = open(file_str, "wb")
            f_page.write(page_data_done)
            f_page.close()

    def pageHandle(self, page_data):
        page_data.replace("http://www.cplusplus.com/", "/")  # here the replace works
        re_rule = '<a href="/reference(/\S{2,40}/)\">'
        list_page_urls = re.findall(re_rule, page_data)
        for each in list_page_urls:
            neweach = each
            neweach = neweach[:-1] + ".html"
            #page_data = page_data.replace(each, neweach)
            page_data.replace(each, neweach)
            if each in page_data:
                print "fail replace"
            if each in self.history_list:
                continue
            elif each in self.urls_list:
                continue
            elif each == '/':
                continue
            self.urls_list.append(each)
        return page_data

def main():
    url = "http://www.cplusplus.com/reference/"
    fc = WebGet(url)
    fc.recurseGet()

if __name__ == "__main__":
    main()
Why could this be?
Because that's what the replace method does: returns a copy of the string with the relevant characters replaced.
Apart from anything else, strings are immutable in Python, so it couldn't work any other way.
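A tiny demonstration of the point (the URL string is just an example):
page_data = '<a href="/reference/iostream/">'
page_data.replace("/iostream/", "/iostream.html")  # result is returned, then thrown away
print "/iostream/" in page_data                    # True - original string unchanged
page_data = page_data.replace("/iostream/", "/iostream.html")  # rebind the name to the new string
print "/iostream/" in page_data                    # False - replacement took effect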