How to read .evtx file using python? - python

Guys do anyone know how to read event log file in C:\Windows\System32\winevt\Logs with .evtx extension?
I have already tried to open it using notepad and read using python but notepad says access is denied...
Do anyone know how to do it? Thanks in advance..

This is how you would read the file "Forwarded Events" from the event viewer. You need admin access so I would run it as admin but I it will prompt you for a password if you don't.
import win32evtlog
import xml.etree.ElementTree as ET
import ctypes
import sys
def is_admin():
try:
return ctypes.windll.shell32.IsUserAnAdmin()
except:
return False
if is_admin():
# open event file
query_handle = win32evtlog.EvtQuery(
'C:\Windows\System32\winevt\Logs\ForwardedEvents.evtx',
win32evtlog.EvtQueryFilePath)
read_count = 0
a = 1
while a == 1:
a += 1
# read 1 record(s)
events = win32evtlog.EvtNext(query_handle, 1)
read_count += len(events)
# if there is no record break the loop
if len(events) == 0:
break
for event in events:
xml_content = win32evtlog.EvtRender(event, win32evtlog.EvtRenderEventXml)
# parse xml content
xml = ET.fromstring(xml_content)
# xml namespace, root element has a xmlns definition, so we have to use the namespace
ns = '{http://schemas.microsoft.com/win/2004/08/events/event}'
substatus = xml[1][9].text
event_id = xml.find(f'.//{ns}EventID').text
computer = xml.find(f'.//{ns}Computer').text
channel = xml.find(f'.//{ns}Channel').text
execution = xml.find(f'.//{ns}Execution')
process_id = execution.get('ProcessID')
thread_id = execution.get('ThreadID')
time_created = xml.find(f'.//{ns}TimeCreated').get('SystemTime')
#data_name = xml.findall('.//EventData')
#substatus = data_name.get('Data')
#print(substatus)
event_data = f'Time: {time_created}, Computer: {computer}, Substatus: {substatus}, Event Id: {event_id}, Channel: {channel}, Process Id: {process_id}, Thread Id: {thread_id}'
print(event_data)
user_data = xml.find(f'.//{ns}UserData')
# user_data has possible any data
else:
ctypes.windll.shell32.ShellExecuteW(None, "runas", sys.executable, " ".join(sys.argv), None, 1)
input()

.evtx is the extension for Windows Eventlog files. It contains data in a special binary format designed by Microsoft so you cannot simply open it in a text editor.
The are open source tools to read .evtx and the NXLog EE can also read .evtx files. (Disclaimer: I'm affiliated with the latter).

I modified the accepted answer a bit as following, so it becomes reusable:
import xml.etree.ElementTree as Et
import win32evtlog
from collections import namedtuple
class EventLogParser:
def __init__(self, exported_log_file):
self.exported_log_file = exported_log_file
def get_all_events(self):
windows_events = []
query_handle = win32evtlog.EvtQuery(str(self.exported_log_file),
win32evtlog.EvtQueryFilePath | win32evtlog.EvtQueryReverseDirection)
while True:
raw_event_collection = win32evtlog.EvtNext(query_handle, 1)
if len(raw_event_collection) == 0:
break
for raw_event in raw_event_collection:
windows_events.append(self.parse_raw_event(raw_event))
return windows_events
def parse_raw_event(self, raw_event):
xml_content = win32evtlog.EvtRender(raw_event, win32evtlog.EvtRenderEventXml)
root = Et.fromstring(xml_content)
ns = "{" + root.tag.split('}')[0].strip('{') + "}"
system = root.find(f'{ns}System')
event_id = system.find(f'{ns}EventID').text
level = system.find(f'{ns}Level').text
time_created = system.find(f'{ns}TimeCreated').get('SystemTime')
computer = system.find(f'{ns}Computer').text
WindowsEvent = namedtuple('WindowsEvent',
'event_id, level, time_created, computer')
return WindowsEvent(event_id, level, time_created, computer)

I use the "python-evtx" library, you can install it using this command:
pip install python-evtx
In my case, I'm not interested in reading records with the "Information" level.
import os
import codecs
from lxml import etree
import Evtx.Evtx as evtx
def evtxFile(absolutePath, filenameWithExt, ext, _fromDate, _toDate):
print("Reading: " + filenameWithExt)
outText = ""
channel = ""
#read the windows event viewer log and convert its contents to XML
with codecs.open(tempFilePath, "a+", "utf-8", "ignore") as tempFile:
with evtx.Evtx(absolutePath) as log:
for record in log.records():
xmlLine = record.xml()
xmlLine = xmlLine.replace(" xmlns=\"http://schemas.microsoft.com/win/2004/08/events/event\"", "")
xmlParse = etree.XML(xmlLine)
level = parseXMLtoString(xmlParse, ".//Level/text()")
if not level == "0" and not level == "4":
providerName = parseXMLtoString(xmlParse, ".//Provider/#Name")
qualifiers = parseXMLtoString(xmlParse, ".//EventID/#Qualifiers")
timestamp = parseXMLtoString(xmlParse, ".//TimeCreated/#SystemTime")
eventID = parseXMLtoString(xmlParse, ".//EventID/text()")
task = parseXMLtoString(xmlParse, ".//Task/text()")
keywords = parseXMLtoString(xmlParse, ".//Keywords/text()")
eventRecordID = parseXMLtoString(xmlParse, ".//EventRecordID/text()")
channel = parseXMLtoString(xmlParse, ".//Channel/text()")
computer = parseXMLtoString(xmlParse, ".//Computer/text()")
message = parseXMLtoString(xmlParse, ".//Data/text()")
if level == "1":
level = "Critical"
elif level == "2":
level = "Error"
elif level == "3":
level = "Warning"
date = timestamp[0:10]
time = timestamp[11:19]
time = time.replace(".", "")
_date = datetime.strptime(date, "%Y-%m-%d").date()
if _fromDate <= _date <= _toDate:
message = message.replace("<string>", "")
message = message.replace("</string>", "")
message = message.replace("\r\n", " ")
message = message.replace("\n\r", " ")
message = message.replace("\n", " ")
message = message.replace("\r", " ")
outText = date + " " + time + "|" + level + "|" + message.strip() + "|" + task + "|" + computer + "|" + providerName + "|" + qualifiers + "|" + eventID + "|" + eventRecordID + "|" + keywords + "\n"
tempFile.writelines(outText)
with codecs.open(tempFilePath, "r", "utf-8", "ignore") as tempFile2:
myLinesFromDateRange = tempFile2.readlines()
#delete the temporary file that was created
os.remove(tempFilePath)
if len(myLinesFromDateRange) > 0:
createFolder("\\filtered_data_files\\")
outFilename = "windows_" + channel.lower() + "_event_viewer_logs" + ext
myLinesFromDateRange.sort()
#remove duplicate records from the list
myFinalLinesFromDateRange = list(set(myLinesFromDateRange))
myFinalLinesFromDateRange.sort()
with codecs.open(os.getcwd() + "\\filtered_data_files\\" + outFilename, "a+", "utf-8", "ignore") as linesFromDateRange:
linesFromDateRange.seek(0)
if len(linesFromDateRange.read(100)) > 0:
linesFromDateRange.writelines("\n")
linesFromDateRange.writelines(myFinalLinesFromDateRange)
del myLinesFromDateRange[:]
del myFinalLinesFromDateRange[:]
else:
print("No data was found within the specified date range.")
print("Closing: " + filenameWithExt)
I hope it helps you or someone else in the future.
EDIT:
The "tempFilePath" can be anything you want, for example:
tempFilePath = os.getcwd() + "\\tempFile.txt"
I collected some information first before calling the "evtxFile" function:
The "From" and the "To" dates are in the following format: YYYY-MM-DD
Converted the dates to "date" data type:
_fromDate = datetime.strptime(fromDate, "%Y-%m-%d").date()
_toDate = datetime.strptime(toDate, "%Y-%m-%d").date()
Divided the directory where the .evtx files are located into different parts:
def splitDirectory(root, file):
absolutePathOfFile = os.path.join(root, file)
filePathWithoutFilename = os.path.split(absolutePathOfFile)[0]
filenameWithExt = os.path.split(absolutePathOfFile)[1]
filenameWithoutExt = os.path.splitext(filenameWithExt)[0]
extension = os.path.splitext(filenameWithExt)[1]
return absolutePathOfFile, filePathWithoutFilename, filenameWithExt, filenameWithoutExt, extension
for root, subFolders, files in os.walk(directoryPath):
for f in files:
absolutePathOfFile, filePathWithoutFilename, filenameWithExt,
filenameWithoutExt, extension = splitDirectory(root, f)
if extension == ".evtx":
evtxFile(absolutePathOfFile, filenameWithExt, ".txt", _fromDate, _toDate)

Related

How to export dict list to csv cleanly?

I have a script I'm writing to make pulling data from my fantasy football league easy and exported in a format that can be played with in Excel easily.
The script I have attached only contains the relevant parts to this questions as the larger script I have written has a lot of moving parts that doesn't apply here.
I'm essentially pulling this players.get_all_players() data from the Sleeper platform using the Sleeper-API-Wrapper (Github link here).
My script will take player data and put it into a .csv like this, with the player ID in the top row and all the info in a single cell below the ID. Screenshot of this below.
Excel .csv screenshot
How can I export this so that the data is nicely formatted into separate rows? I have a different spreadsheet where I'd like to be able to pull this data to automatically.
Alternatively, if I'm doing this in a really roundabout way, please let me know! This is the JSON response from the platform: JSON Response
# 9 All players - players.get_all_players()
warning = 1
while warning == 1:
print("%s%s\n\n\nWARNING:%s" % (fg(15), bg(9), attr(0)))
print("%s%sthe 'all players' option is intensive and may freeze your PC for several minutes.%s" % (fg(15), bg(0), attr(1)))
warning = input("continue anyway? (y/n)\n")
if warning == "n":
pe_loop = 0
action = 0
elif warning == "y":
name = "all players"; file = name
output = players.get_all_players()
break
else:
print("Not a valid option, try again.")
warning = 1
overwrite = 0
name_change = 0
while action == 0:
try:
action = int(input("%s%s\n1 - print\n2 - export to Excel\n3 - back to tasks\n4 - end\n--------------------\n%s" % (fg(14), bg(0), attr(1))))
except ValueError:
print("Not a valid option, try again.")
## Print
if action == 1 and week != 18:
print(output)
break
elif action == 1 and week == 18:
week = 0
while week < 18:
week += 1
if task == 3:
output = league.get_matchups(week)
elif task == 4:
output = league.get_transactions(week)
print(output)
## Export
elif action == 2:
path = os.path.join(parent_dir, file)
name_change = input("\nDo you want to name the file? (y/n)\n")
if name_change == "y":
name = input("\nEnter file name now:\n")
if name_change == "n":
file_path = path + "\\" + name + '_' + str(year) + ".xlsx"
if os.path.isfile(file_path) == True:
overwrite = input("\nFile name... '" + name + "' already exists! Would you like to overwrite this file? (y/n)\n")
if overwrite == "n":
count = 0
while os.path.isfile(file_path) == True:
count += 1
new_name = name + "_" + str(count)
file_path = path + "\\" + new_name + ".xlsx"
else:
name = new_name
print("\nThe new file was automatically named: " + new_name + "_wk" + str(week) + "\nand placed in: " + path)
if os.path.isdir(path) == False and overwrite == 0:
os.mkdir(path)
print("\nCreating new file path... " + file + "\n")
elif os.path.isdir(path) == True and overwrite == 0:
print("\nFile path... '" + file + "' exists!\n")
toCSV = output
# 9 All Players CSV file
with open(parent_dir + file + "\\" + name + ".csv", 'w', encoding='utf8', newline='') as output_file:
fc = csv.DictWriter(output_file, output.keys())
fc.writeheader()
fc.writerow(toCSV)
It turns out that sleeper_wrapper exposes a method players.get_players_df that gives you a pandas DataFrame containing all players.
Write that to a csv file using to_csv as suggested in the comments.
Strip down your code to receive better answers faster :)
This is the code that your question needs:
from sleeper_wrapper import Players
import csv
players = Players()
toCSV = players.get_all_players()
with open(parent_dir + file + "\\" + name + ".csv", 'w', encoding='utf8', newline='') as output_file:
fc = csv.DictWriter(output_file, output.keys())
fc.writeheader()
fc.writerow(toCSV)
This is how you write the csv using pandas:
import pandas as pd
from sleeper_wrapper import Players
players = Players()
all_players = players.get_all_players()
# stolen from https://github.com/NotTheCrocHunter/sleeper-api-wrapper/blob/91d8cf1b64cf55884b4c4746d53ccd1259d11c1f/sleeper_wrapper/players.py#L41
# because that method is unavailable in the version of sleeper_wrapper in PyPI
all_players_df = pd.DataFrame.from_dict(all_players, orient="index")
# all_players_df contains some information on teams as well, maybe you want to filter that out...
all_players_df.to_csv("your_output_file.csv")

Finding text string with pfdminer not consistent [Python]

I've got a question about a code that's getting text string from a pdf file and returns the output in a .csv
The output is stored in Output.csv. Like you can see it returns value on p.27 here the code works and 29, p. 28 is missing. What i want to return is textstring on p. 28 code not working.
Can somebody tell me what im doing wrong? In the 2nd code pdfminer does read out the proper output that is needed.
import re, csv, os
import sys, time
from tqdm import tqdm
import multiprocessing as mp
from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
print ("Trying to Install required module: pdfminer\n")
os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)
# method 3: object oriented programming
class Program:
#initialisation (happens when Program() is called for the first time)
def __init__(self):
# locations
# this defines the location of the workspace and directory of the data to process
self.ws_loc = Path("C:/Users/pco/Desktop/workspace")
self.dat_loc = Path("C:/Users/pco/Desktop/workspace/data/Test")
# lookuptable
# this converts the lookuptable from maximo to a list which can be used for comparison
self.lu_file = self.ws_loc / "lookuptable.csv"
with open(self.lu_file, newline='') as f:
reader = csv.reader(f)
self.lu_list = list(filter(None,list(reader)))
self.lu_list = [each[0] for each in self.lu_list]
def listener(self,q):
'''listens for messages on the q (queue), writes (appends) to file (output.csv). '''
# open output.csv in location workspace/data/ and use as 'f'
with open(self.ws_loc / 'output.csv', 'a') as f:
#start infinite listening loop until 'kill' message is received
while 1:
# get the message which is first in q (queue)
m = q.get()
# break loop if message is kill and close file 'output.csv'
if m == 'kill':
f.close()
break
# if message is not 'kill' then write message to file and flush file
f.write(m)
f.flush()
def worker(self, file, q):
''' processes a pdf file given by main() and writes output to q (queue)'''
# init PDF class (this class is used to get pages from the PDF and process pdftext)
PDF = self.PDF(self.dat_loc,self.lu_list,0)
# get all the pages from PDF: contains pages = [page1, ..., pageN]
# pageN = "bla bla \n bla etc."
PDFpages = PDF.getPages(file)
pages = []
for page in PDFpages:
pages.append(page)
# varargs defines extra data for files (this is where metadata is stored)
# varargs should not be filled here, but it is initialized here.
varargs = ''
# check if file is a manual (this can be seen as an example for a varargs entry)
# it should contain atleast ',' (this creates a new column entry in the csv)
# PDF.fileCategory() which is a class within the Program class, can be taken as an example
varargs+= PDF.fileCategory(file,pages) + ',' + PDF.fileSupplier(file, pages) + ',' + PDF.fileRev(file, pages)
# new vararg can be added like: varargs+= THE_VARARG
# initialise pageNum (which is a page number identifier inside the for loop)
pageNum = 1
# create an empty datastack (which is the message that will be send to q (queue))
datastack = ''
# for each page do...
for page in pages:
'''!!! for each page look for tags (THIS IS WHERE THE REGEX HAPPENS PDF.find_tag()) !!!'''
found_strings, found = PDF.find_tag(page)
# found_stringsrev, foundrev = PDF.find_rev(page)
# if tags are found, then fix the tags such that they are correct with
# Program.putStripe() (or self.putStripe()) it changes 12AB1234A to 12-AB-1234-A
# if foundrev:
# string = ''
# fixedstring = ''
# for stringrev in found_stringsrev:
# # fill datastack with found tags
# datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + ',' + stringrev + ',' + varargs + '\n'
if found:
for string in found_strings:
# if correct, do not change
fixedstring = string
# check if the tag matches the correct regexpression ('regex' or 're')
if re.match('^(\d{1,2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', string)!=None:
# else fix the tag
fixedstring = self.putStripe(string)
# fill datastack with found tags
datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + varargs + '\n'
# next page, so pageNum becomes pageNum + 1
pageNum +=1
# if the datastack is empty, we are still interested in the varargs:
# (so empty tag columns are added)
if datastack=='':
datastack = file + ',' + ',' + ',' + varargs + '\n'
# put the datastack message inside of the q (queue)
q.put(datastack)
# terminate the PDF class so that the pdf file is closed in a correct way
PDF.terminate()
# return (in case the datastack should be printed)
return datastack
def putStripe(self,input):
'''This function fixes a tag that is not correct'''
# strip the tag from spaces
input = re.sub(' ','',input)
# for each string that matches the expression write to words
words = re.findall('[0-9][A-Za-z]+', input)
words += re.findall('[A-Za-z][0-9]+', input)
# for each match inside the tag add a '-' in the second position
for word in words:
i = input.find(word)+1
input = input[:i] + '-' + input[i:]
# return the fixed tag
return input
def main(self):
try:
# initiate time
t = time.time()
# create pools for paralell pooling (max cpu threads is optained automatically)
pool = mp.Pool(mp.cpu_count() + 2)
# create a manager
manager = mp.Manager()
# from the pool manager create a queue object which can be used to
# exchange data between the worker and listener
q = manager.Queue()
# start up listener first
# ignore warning, it is being used
watcher = pool.apply_async(self.listener, (q,))
# fire off workers (basically assign them jobs)
jobs = []
# NOTE: FOR LOOPS DO NOT CAUSE A LOOP, CODE PROCEEDS WITH PARALLEL THREADING
# AS IF THE RESULT OF EACH LOOP IS INSTANTLY COMPLETED
# each file in the data location is a job
for file in os.listdir(self.dat_loc):
# assign the job to a worker
job = pool.apply_async(self.worker, (file, q))
# append the job to jobs (for data aquisition)
jobs.append(job)
# this is used to get the data back from jobs
for job in tqdm(jobs):
#print('')
#print(job.get()[:-1])
job.get()
# printed elapsed time (good for project management)
print('elapsed time = ' + str(time.time()-t) + ' seconds')
# catch interupt and try to properly terminate workers (might take time)
# best to just do everything in batches and dont interrupt
except KeyboardInterrupt:
print("\nCaught KeyboardInterrupt, terminating workers")
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
pool.terminate()
SystemExit(1)
# always excecute (kills workers and listener)
finally:
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
def execute(self):
self.main()
class PDF:
# from PDF.
def __init__(self,dat_loc,lu_list,maxpages):
self.dat_loc = dat_loc
self.lu_list = lu_list
self.lu_list_f = 0
self.password = ""
self.maxpages = maxpages
self.caching = True
self.rsrcmgr = PDFResourceManager()
self.retstr = StringIO()
self.laparams = LAParams()
self.device = TextConverter(self.rsrcmgr, self.retstr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.pagenos=set()
# from PDF.
def getPages(self,file):
self.fp = open(self.dat_loc / file, 'rb')
pages = PDFPage.get_pages(self.fp,
self.pagenos,
maxpages=self.maxpages,
password=self.password,
caching=self.caching,
check_extractable=True)
return pages
# from PDF.
def fileCategory(self,file,pages):
rules = []
rules.append(['Manual',['ANLAGE - INSTALLATION','User Guide','MANUAL','Manual','manual','Handleiding','handleiding','Instruction','instructions','Instructie', 'Guide', 'GUIDE']])
rules.append(['Specification',['SPECIFICATION','Specification','Specificatie']])
rules.append(['Datasheet',['DATA BOOK','UTILITIES LIST','DATA PACKAGE','Data Package','data-sheet','Datasheet','DATASHEET','datasheet','DATA SHEET','Data Sheet','Data sheet','data sheet']])
rules.append(['Spare part list',['SPARE PARTS LIST']])
rules.append(['Invoice',['BILL OF MATERIAL','invoice','Invoice','INVOICE','Purchase order','Purchase Order','PURCHASE ORDER']])
rules.append(['Schematic Diagram',['SCHEMATIC DIAGRAM','Schematic Diagram','Schematic diagram', 'ISOMETRIC', 'Isometric', 'isometric']])
rules.append(['Checklist', ['Checklist', 'CHECKLIST', 'CHECKSHEET', 'Checksheet']])
rules.append(['Certificates', ['Certificate', 'CERTIFICATE', 'Zertifikat', 'ZERTIFIKAT', 'Certificat', 'CERTIFICAT']])
rules.append(['Required documents list', ['REQUIRED SUBMITTAL DOCUMENTS']])
fileCategory = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileCategory += ',' + category
else:
fileCategory += ',' + 'Unreadable'
return fileCategory
# from PDF.
def fileSupplier(self,file,pages):
rules = []
rules.append(['JE Jacobs',['JE Jacobs', 'JE JACOBS', 'Jacobs', 'JACOBS']])
rules.append(['Emerson',['Emerson', 'Emerson Process Management', 'EMERSON',]])
rules.append(['Air Liquide',['Air Liquide', 'AIR LIQUIDE']])
rules.append(['Rosemount',['ROSEMOUNT', 'Rosemount']])
rules.append(['Deltak',['Deltak', 'DELTAK']])
rules.append(['AviComp',['AVICOMP', 'Avicomp', 'avicomp']])
fileSupplier = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for rule in rules:
category = rule[0]
category_rules = rule[1]
for line in self.pagestr(page).splitlines():
if any(line.find(x)!=-1 for x in category_rules):
found = True
if found:
break
if found:
break
if found:
break
counter+=1
if found:
fileSupplier += ',' + category
else:
fileSupplier += ',' + 'Supplier N/A'
return fileSupplier
# from PDF.
def fileRev(self,file,pages):
fileRev = ''
found = False
counter = 1
for page in pages:
if counter>4:
break
for line in self.pagestr(page).splitlines():
if re.match('^(Rev.*).*$', line):
found = True
if found:
break
if found:
break
counter+=1
if found:
fileRev += ',' + line
else:
fileRev += ',' + ''
return fileRev
# from PDF.
def find_string_lookup(self,page,pageNum,file,varargs):
datastack = []
data = []
found = False
for line in self.pagestr(page).splitlines():
line = re.sub('[^A-Za-z0-9]+', '', line)
counter = 0
for tag in self.lu_list_f:
if line.find(tag)!=-1:
found = True
data = file + ',' + str(self.lu_list[counter][0]) + ',' + str(pageNum) + varargs +'\n'
if data not in datastack:
datastack += [data]
counter+=1
return datastack, found
# from PDF.
def find_string(self,page,strings,Method=None):
datastack = []
data = []
found = False
if Method=='ALPHABET_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line) for line in strings]
elif Method=='ALPHABETCAPS_NUM_ONLY':
tags = [re.sub('[^A-Za-z0-9]+', '', line).upper() for line in strings]
elif Method=='ALPHABETCAPS':
tags = [line.upper() for line in strings]
else:
tags = strings
for line in self.pagestr(page).splitlines():
if Method=='ALPHABET_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line)
elif Method=='ALPHABETCAPS_NUM_ONLY':
line = re.sub('[^A-Za-z0-9]+', '', line).upper()
elif Method=='ALPHABETCAPS':
line = line.upper()
i = 0
for tag in tags:
if tag != '':
if line.find(tag)!=-1:
found = True
data = strings[i]
if data not in datastack:
datastack += [data]
i+=1
return datastack, found
# from PDF.
def find_tag(self,page):
datastack = []
found = False
for line in self.pagestr(page).splitlines():
tags = re.findall('^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', line)
for tag in tags:
if tag not in datastack:
datastack += [tag]
found = True
return datastack, found
# from PDF.
# def find_rev(self,page):
# datastack = []
# found = False
# for line in self.pagestr(page).splitlines():
# tags = re.findall('^(Rev.*).*$', line)
# for tag in tags:
# if tag not in datastack:
# datastack += [tag]
# found = True
# return datastack, found
# from PDF.
def pagestr(self,page):
self.retstr.truncate(0)
self.retstr.seek(0)
self.interpreter.process_page(page)
return self.retstr.getvalue()
# from PDF.
def terminate(self):
self.fp.close()
self.device.close()
self.retstr.close()
# start the code (the proper way)
if __name__ == '__main__':
Program().execute()
If i read out the pdf with this code in python (also with pdfminer):
from pathlib import Path
from io import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
except ImportError:
print ("Trying to Install required module: pdfminer\n")
os.system('python -m pip install pdfminer')
# -- above lines try to install requests module if not present
# -- if all went well, import required module again ( for global access)
class glb():
workspace_folder = Path('C:/Users/pco/Desktop/workspace')
data_folder = Path('C:/Users/pco/Desktop/workspace/data/Test')
lookup_file = workspace_folder / "lookuptable.csv"
with open(lookup_file, newline='') as f:
reader = csv.reader(f)
lookup_list = list(reader)
lookup_list_filtered = list(filter(None,[re.sub('[^A-Za-z0-9]+', '', str(line)) for line in lookup_list]))
def find_tagnumbers(path):
pagelines = []
rsrcmgr = PDFResourceManager()
retstr = StringIO()
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
page_no = 1
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
page_str = retstr.getvalue()
pagelines.append(page_str.splitlines())
retstr.truncate(0)
retstr.seek(0)
page_no +=1
page_no +=-1
print(pagelines)
fp.close()
device.close()
retstr.close()
return 1
find_tagnumbers('C:/Users/pco/Desktop/workspace/data/Test/1845613_1_27_Marked.pdf')
it does returns 47-AT -0053. But if i run the code below it doesn't return the value in output file. Output when i print pagelines
p.s. my coding skills is beginner (so i write out all the steps)

Script cant write .txt file. Pythonanywhere

I have a simple script for yandex.metrika counters goals logging. It writes when goal was created or deleted to txt file. Code for writing to text file:
if cID == 18662179:
with open('toyota_goalss_log.txt','a') as log2:
print(str(datetime.date.today()) +str(res2), file = log2)
print(str(datetime.date.today()) +str(res2),cID)
log2.close()
The script runs correctly if I click the button "run" in editor on pythonanywhere: No errors, the data is appended to the text files. But if I create a task to run this script every hour the data does not get appended to the text files... and there are no errors in task log or errors log, too. What did I do wrong?
More code:
#!/usr/bin/python3.6
import requests
import datetime
from pprint import pprint
import time
def goalsS():
token = 'AQAAAAAFKNk4AAPquxxxxxxxxx'
headers = {'Authorization': 'OAuth ' + token}
countersDict = {18662179:'site.ru', 901167:'site.ru'}
counterIds = [18662179, 901167]
for cID in counterIds:
names = []
ng=[]
url = "https://api-metrika.yandex.net/management/v1/counter/"+str(cID)+"/goals"
r = requests.get(url, headers=headers)
res = r.json()['goals']
for i in res:
ng.append(str(i['id'])+": "+ i['name']+'|')
names.append(i['name'])
goalsDict = dict(zip(ng,names))
clear = str(ng).replace('[','').replace(']','').replace("'",'').replace(',','')
with open(str(cID)+'goals_log.log','a') as log:
print(clear, file = log)
log.close()
li = []
f = open(str(cID)+'goals_log.log', 'r')
for line in f:
line = set(line.rstrip("\n").split('|'))
li.append(line)
res2 = li[-1] - li[-2]
if res2 == set():
res2 = li[-2]-li[-1]
print(res2,'set')
if res2 == set():
pass
else:
if cID == 18662179:
with open('toyota_goalss_log.txt','a') as log2:
print(str(datetime.date.today()) + ' ' + 'Удалили цель(и)'+' '+str(res2).replace(',','').replace('{','').replace('}',''),file = log2)
print(str(datetime.date.today()) + ' ' + 'Удалили цель\цели'+' '+str(res2),cID)
log2.close()
else:
if cID == 18662179:
with open('toyota_goalss_log.txt','a') as log2:
print(str(datetime.date.today()) + ' ' + 'Создали цель(и)'+' '+str(res2).replace(',','').replace('{','').replace('}',''),file = log2)
print(str(datetime.date.today()) + ' ' + 'Создали цель\цели'+' '+str(res2),cID)
log2.close()
if __name__ == '__main__':
goalsS()
Use absolute path to the file.
with open('/path/to/file','a') as log2:
...

How to make a file transfer program in python

The title might not be relevant for my question becuase I don't actually want a wireless file transfering script, I need a file manager type.
I want something with which I can connect my phone with my pc (eg: hotspot and wifi) and then I would like to show text file browser (I have the code for that) by sending lists of all files and folders using os.listdir(), whenever the selected option is a file (os.path.isdir() == False), I would like to transfer the file and run it(like: picture, video, etc).
The file Browser code which I wrote runs on windows and also Android (after making a few changes) using qpython.
My code is
import os
def FileBrowser(cwd = os.getcwd()):
while True:
if cwd[-1:] != "\\":
cwd = cwd + "\\"
files = os.listdir(cwd)
count = 1
tmpp = ""
print("\n\n" + "_"*50 +"\n\n")
print(cwd + "\n")
for f in files:
if os.path.isdir(cwd + f) == True:
s1 = str(count) + ". " + f
tmps1 = 40 - (len(s1)+5)
t2 = int(tmps1/3)
s1 = s1 + " " * t2 + "-" * (tmps1 - t2)
print(s1 + "<dir>")
else:
print(str(count) + ". " + f + tmpp)
count = count + 1
s = raw_input("Enter the file/Directory: ")
if s == "...":
tmp1 = cwd.count("\\")
tmp2 = cwd.rfind("\\")
if tmp1 > 1:
cwd = cwd[0:tmp2]
tmp2 = cwd.rfind("\\")
cwd = cwd[0:tmp2+1]
continue
else:
continue
else:
s = int(s) - 1
if os.path.isdir(cwd + files[s]) == True:
cwd = cwd + files[s] + "\\"
continue
else:
f1 = files[s]
break
return f1
def main():
fb = FileBrowser()
main()
A very naive approach using Python is to go to the root of the directory you want to be served and use:
python -m SimpleHTTPServer
The connect to it on port 8000.
you may need to socket programming. creating a link (connection) between your PC and you smart phone and then try to transfer files

separate line output by groups

My python script checks mysqldump and if any problems script prints :
Dump is old for db;
Dump is not complete for db;
Dump is empty for db;
MySQL dump does not exist for db;
Script logs these records to the file line by line.
My question is there are a way to format output in the file like:
Dump is old for db;
Dump is old for db;
Dump is old for db;
Dump is not complete for db;
Dump is not complete for db;
Dump is not complete for db;
Dump is empty for db;
Dump is empty for db;
Dump is empty for db;
Because now my file looks like:
Dump is old for db;
Dump is empty for db;
Dump is old for db;
MySQL dump does not exist for db;
...
etc
Here my small script :)
#!/bin/env python
import psycopg2
import sys,os
from subprocess import Popen, PIPE
from datetime import datetime
import smtplib
con = None
today = datetime.now().strftime("%Y-%m-%d")
log_dump_fail = '/tmp/mysqldump_FAIL'
log_fail = open(log_dump_fail,'w').close()
log_fail = open(log_dump_fail, 'a')
sender = 'PUT_SENDER_NAME_HERE'
receiver = ['receiver_name']
smtp_daemon_host = 'localhost'
def db_backup_file_does_not_exist(db_backup_file):
if not os.path.exists(db_backup_file): return True
else: return False
def dump_health(last_dump_row, file_name,db):
last_row = last_dump_row.rsplit(" ")
tms = ''.join(last_row[4:5])
status = last_row[1:3]
if (status) and (tms != today):
log_fail.write("\nDB is old for "+ str(db) + str(file_name) + ", \nDump finished at " + str(''.join(tms)))
log_fail.write("\n-------------------------------------------")
elif not (status) and (tms == None):
log_fail.write("\nDump is not complete for "+str(db) + str(file_name) + " , end of file is not correct")
log_fail.write("\n-------------------------------------------")
suffixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
def humansize(nbytes):
if nbytes == 0: return '0 B'
i = 0
while nbytes >= 1024 and i < len(suffixes)-1:
nbytes /= 1024.
i += 1
f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
return '%s %s' % (f, suffixes[i])
def dump_size(dump_file, file_name,db):
size = os.path.getsize(dump_file)
if (size < 1024):
human_readable = humansize(size)
log_fail.write("\nDump is empty for " +str(db) + "\n" +"\t" + str (file_name)+", file size is " + str(human_readable))
log_fail.write("\n-------------------------------------------")
def report_to_noc(isubject,text):
TEXT = text
SUBJECT = subject
message = 'Subject: %s\n\n%s' % (SUBJECT, TEXT)
server = smtplib.SMTP(smtp_daemon_host)
server.sendmail(sender, receiver, message)
server.quit()
try:
con = psycopg2.connect(database='**', user='***', password='***', host='****')
cur = con.cursor()
cur.execute("""\
select ad.servicename, (select name from servers where id = ps.server_id) as servername
from packages as p, account_data as ad, package_servers as ps
where p.id=ad.package_id and
p.date_deleted IS NULL and
p.id=ps.package_id and
p.aktuel IS NULL and
p.pre_def_package_id = 4 and
p.mother_package_id !=0 and
ps.subservice_id=5 and
p.mother_package_id NOT IN (select id from packages where date_deleted IS NOT NULL)
ORDER BY servername;
""")
while (1):
row = cur.fetchone ()
if row == None:
break
db = row[0]
server_name = str(row[1])
if (''.join(server_name) == 'SKIP_THIS') or (''.join(server_name) == 'SKIP_THIS'):
continue
else:
db_backup_file = '/storage/backup/db/mysql/' + str(db) + '/current/' + str(db) + '.mysql.gz'
db_backup_file2 = '/storage/backup/' + str(''.join(server_name.split("DB"))) + '/mysql/' + str(db) + '/current/'+ str(db) + '.mysql.gz'
db_file_does_not_exist = False
db_file2_does_not_exist = False
if db_backup_file_does_not_exist(db_backup_file):
db_file_does_not_exist = True
if db_backup_file_does_not_exist(db_backup_file2):
db_file2_does_not_exist = True
if db_file_does_not_exist and db_file2_does_not_exist:
log_fail.write("\nMySQL dump does not exist for " + str(db) + "\n" + "\t" + str(db_backup_file2) + "\n" + "\t" + str(db_backup_file))
log_fail.write("\n-------------------------------------------")
continue
elif (db_file_does_not_exist) and not (db_file2_does_not_exist):
p_zcat = Popen(["zcat", db_backup_file2], stdout=PIPE)
p_tail = Popen(["tail", "-2"], stdin=p_zcat.stdout, stdout=PIPE)
dump_status = str(p_tail.communicate()[0])
dump_health(dump_status,db_backup_file2,db)
dump_size(db_backup_file2, db_backup_file2,db)
elif (db_file2_does_not_exist) and not (db_file_does_not_exist):
p_zcat = Popen(["zcat", db_backup_file], stdout=PIPE)
p_tail = Popen(["tail", "-2"], stdin=p_zcat.stdout, stdout=PIPE)
dump_status = str(p_tail.communicate()[0])
dump_health(dump_status,db_backup_file,db)
dump_size(db_backup_file,db_backup_file,db)
con.close()
except psycopg2.DatabaseError, e:
print 'Error %s' % e
sys.exit(1)
log_fail.close()
if os.path.getsize(log_dump_fail) > 0:
subject = "Not all MySQL dumps completed successfully. Log file backup:" + str(log_dump_fail)
fh = open(log_dump_fail, 'r')
text = fh.read()
fh.close()
report_to_noc(subject,text)
else:
subject = "MySQL dump completed successfullyi for all DBs, listed in PC"
text = "Hello! \nI am notifying you that I checked mysqldump files this morning.\nThere are nothing to worry about. :)"
report_to_noc(subject,text)
You can process your log file after it has been written.
One option is to read your file and sort the lines:
lines = open('log.txt').readlines()
lines.sort()
open('log_sorted.txt', 'w').write("\n".join(lines))
This won't emit an empty line between log types.
Another option is to use a Counter:
from collections import Counter
lines = open('log.txt').readlines()
counter = Counter()
for line in lines:
counter[line] += 1
out_file = open('log_sorted.txt', 'w')
for line, num in counter.iteritems():
out_file.write(line * num + "\n")
Looks like you want to group the output of the script, rather than log the info as it comes while searching.
Easiest would be to maintain 4 lists, on each for empty, not empty and so on. In the script add the db names to appropriate list instead of logging, and then dump the lists one by one into the file with appropriate prefixes("not empty for" + dbname).
For example, remove all the log_fail.write() from the functions and replace them with list.append() and write a separate function that writes to the log file as you like:
Add lists:
db_dump_is_old_list = []
db_dump_is_empty_list = []
db_dump_is_not_complete_list = []
db_dump_does_not_exist_list = []
Modify the Functions:
def dump_health(last_dump_row, file_name,db):
last_row = last_dump_row.rsplit(" ")
tms = ''.join(last_row[4:5])
status = last_row[1:3]
if (status) and (tms != today):
db_dump_is_old_list.append(str(db))
#log_fail.write("\nDB is old for "+ str(db) + str(file_name) + ", \nDump finished at " + str(''.join(tms)))
#log_fail.write("\n-------------------------------------------")
elif not (status) and (tms == None):
db_dump_is_not_complete_list.append(str(db)
#log_fail.write("\nDump is not complete for "+str(db) + str(file_name) + " , end of file is not correct")
#log_fail.write("\n-------------------------------------------")
def dump_size(dump_file, file_name,db):
size = os.path.getsize(dump_file)
if (size < 1024):
human_readable = humansize(size)
db_dump_is_empty_list.append(str(db))
#log_fail.write("\nDump is empty for " +str(db) + "\n" +"\t" + str (file_name)+", file size is " + str(human_readable))
#log_fail.write("\n-------------------------------------------")
if db_file_does_not_exist and db_file2_does_not_exist:
db_dump_does_not_exist_list.append(str(db))
#log_fail.write("\nMySQL dump does not exist for " + str(db) + "\n" + "\t" + str(db_backup_file2) + "\n" + "\t" + str(db_backup_file))
#log_fail.write("\n-------------------------------------------")
continue
And add a logger function:
def dump_info_to_log_file():
log_dump_fail = '/tmp/mysqldump_FAIL'
log_fail = open(log_dump_fail,'w').close()
log_fail = open(log_dump_fail, 'a')
for dbname in db_dump_is_old_list:
log_fail.write("Dump is Old for" + str(dbname))
log_fail.write("\n\n")
for dbname in db_dump_is_empty_list:
log_fail.write("Dump is Empty for" + str(dbname))
log_fail.write("\n\n")
for dbname in db_dump_is_not_complete_list:
log_fail.write("Dump is Not Complete for" + str(dbname))
log_fail.write("\n\n")
for dbname in db_dump_does_not_exist_list:
log_fail.write("Dump Does Not Exist for" + str(dbname))
log_fail.close()
Or you could simply log as you are doing, and then read in the file, sort and write back the file.
Thank you all for all interesting ideas.
I have really tried all options :)
To my mind:
With Counter object the pros is to few lines of code.
But cons are - many read\write operations. Log file is not big, however, I decided to decrease read(s) \ write(s)
With array the cons are to many lines of code :) but the pros is - write to the file only once.
So I implemented arrays.. :)
Thank you guys!!!

Categories