I have files, some of which contain CommonChar lines, and my Python code works on them to build a dictionary. While building, there are some required keys which users might forget to put in. The code should be able to flag the file and the key which is missing.
The syntax the Python code works on is like this:
CommonChar pins Category General
CommonChar pins Contact Mark
CommonChar pins Description 1st line
CommonChar pins Description 2nd line
CommonChar nails Category specific
CommonChar nails Description 1st line
So for the above example, "Contact" is missing for the nails group; a line like this would be required:
CommonChar nails Contact Robert
I have a list of must-have keys, e.g.: mustNeededKeys=["Category", "Description", "Contact"]
import os
import re
import sys

mainDict = {}
for dirName, subdirList, fileList in os.walk(sys.argv[1]):
    for eachFile in fileList:
        # excluding file names ending in .swp, .swo which are created
        # temporarily when editing in vim
        if not eachFile.endswith(('.swp', '.swo', '~')):
            #print eachFile
            filePath = os.path.join(dirName, eachFile)
            #print filePath
            with open(filePath, "r") as fh:
                contents = fh.read()
            items = re.findall("CommonChar.*$", contents, re.MULTILINE)
            for x in items:
                cc, group, topic, data = x.split(None, 3)
                data = data.split()
                group_dict = mainDict.setdefault(group, {'fileLocation': [filePath]})
                if topic in group_dict:
                    group_dict[topic].extend(['</br>'] + data)
                else:
                    group_dict[topic] = data
The above code does its job of building a dict like this (each group also gets a 'fileLocation' entry):
{'pins': {'Category': ['General'], 'Contact': ['Mark'], 'Description': ['1st', 'line', '</br>', '2nd', 'line']}, 'nails': {'Category': ['specific'], 'Description': ['1st', 'line']}}
So when reading each file with CommonChar and building a group_dict, I need a way to check all the keys against mustNeededKeys, flag the file if any key is missing, and proceed if the requirement is met.
Something like this should work:
# Setup mainDict (equivalent to code given above)
mainDict = {
    'nails': {
        'Category': ['specific'],
        'Description': ['1st', 'line'],
        'fileLocation': ['/some/path/nails.txt']
    },
    'pins': {
        'Category': ['General'],
        'Contact': ['Mark'],
        'Description': ['1st', 'line', '</br>', '2nd', 'line'],
        'fileLocation': ['/some/path/pins.txt']
    }
}
# check for missing keys
mustNeededKeys = {"Category", "Description", "Contact"}
for group, group_dict in mainDict.items():
    missing_keys = mustNeededKeys - set(group_dict.keys())
    if missing_keys:
        missing_key_list = ','.join(missing_keys)
        print(
            'group "{}" ({}) is missing key(s): {}'
            .format(group, group_dict['fileLocation'][0], missing_key_list)
        )
# group "nails" (/some/path/nails.txt) is missing key(s): Contact
If you must check for missing keys immediately after processing each group, you could use the code below. This assumes that each group is stored as a contiguous collection of rows in a single file (i.e., not mixed with other groups in the same file or spread across different files).
import os
import re
import sys
from itertools import groupby

mainDict = {}
mustNeededKeys = {"Category", "Description", "Contact"}
for dirName, subdirList, fileList in os.walk(sys.argv[1]):
    for eachFile in fileList:
        # excluding file names ending in .swp, .swo which are created
        # temporarily when editing in vim
        if not eachFile.endswith(('.swp', '.swo', '~')):
            #print eachFile
            filePath = os.path.join(dirName, eachFile)
            #print filePath
            with open(filePath, "r") as fh:
                contents = fh.read()
                items = re.findall("CommonChar.*$", contents, re.MULTILINE)
                split_items = [line.split(None, 3) for line in items]
                # group the items by group name (element 1 in each row)
                for g, group_items in groupby(split_items, lambda row: row[1]):
                    group_dict = {'fileLocation': [filePath]}
                    # store all items in the current group
                    for cc, group, topic, data in group_items:
                        data = data.split()
                        if topic in group_dict:
                            group_dict[topic].extend(['</br>'] + data)
                        else:
                            group_dict[topic] = data
                    # check for missing keys
                    missing_keys = mustNeededKeys - set(group_dict.keys())
                    if missing_keys:
                        missing_key_list = ','.join(missing_keys)
                        print(
                            'group "{}" ({}) is missing key(s): {}'
                            .format(g, filePath, missing_key_list)
                        )
                    # add group to mainDict
                    mainDict[g] = group_dict
data = '''CommonChar pins Category General
CommonChar pins Contact Mark
CommonChar pins Description 1st line
CommonChar pins Description 2nd line
CommonChar nails Category specific
CommonChar nails Description 1st line'''
from collections import defaultdict
from pprint import pprint
required_keys = ["Category", "Description", "Contact"]
d = defaultdict(dict)
for line in data.splitlines():
    line = line.split()
    if line[2] == 'Description':
        if line[2] not in d[line[1]]:
            d[line[1]][line[2]] = []
        d[line[1]][line[2]].extend(line[3:])
    else:
        d[line[1]][line[2]] = [line[3]]
pprint(dict(d))
print('*' * 80)
# find missing keys
for k in d.keys():
    # use a set difference so extra keys in d[k] are not flagged as missing
    for missing_key in set(required_keys) - set(d[k].keys()):
        print('Key "{}" is missing "{}"!'.format(k, missing_key))
Prints:
{'nails': {'Category': ['specific'], 'Description': ['1st', 'line']},
'pins': {'Category': ['General'],
'Contact': ['Mark'],
'Description': ['1st', 'line', '2nd', 'line']}}
********************************************************************************
Key "nails" is missing "Contact"!
How do I extract text from this PDF file where some data is in the form of a table while some is key-value based data? E.g.:
https://drive.internxt.com/s/file/78f2d73478b832b2ab55/3edb275967deeca6ad33e7d53f2337c50d5dfb50e0aa525bb7f10d49dff1e2b4
This is what I have tried:
import PyPDF2
import openpyxl
from openpyxl import Workbook
pdfFileObj = open('sample.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
mytext = pageObj.extractText()
wb = Workbook()
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext
wb.save('sample.xlsx')
print('Save')
However I'd like the data to be stored in the following format.
This PDF does not have well-defined tables, hence we cannot use a tool that extracts the entire data in one table format. What we can do is read the entire PDF as text and process each data field line by line, using regex to extract the data.
Before you move ahead, please install the pdfplumber package for Python:
pip install pdfplumber
Assumptions
Here are some assumptions that I made for your PDF, and I have written the code accordingly.
First line will always contain the title Account History Report.
Second line will contain the names IMAGE All Notes
Third line will contain only the data Date Created in the form of key:value.
Fourth line will contain only the data Number of Pages in the form of key:value.
Fifth line will only contain the data Client Code, Client Name
Starting at line 6, a PDF can have multiple data entities; in this PDF there are 2, but there can be any number.
Each data entity will contain the following fields:
First line in data entity will contain only the data Our Ref, Name, Ref 1, Ref 2
Second line will only contain data in the form present in the PDF: Amount, Total Paid, Balance, Date of A/C, Date Received.
Third line in data entity will contain the data Last Paid, Amt Last Paid, Status, Collector.
Fourth line will contain the column name Date Notes
The subsequent lines will contain data in the form of table until the next data entity is started.
I also assume that the first field of each data entity has the key Our Ref :.
I assume that data entities are separated at the first line of each entity, which follows the key/value pattern Our Ref :Value Name: Value Ref 1 :Value Ref 2:value
pattern = r'Our Ref.*?Name.*?Ref 1.*?Ref 2.*?'
Please note: the rectangles I have drawn (thick black) in the above image are what I am calling data entities.
The final data will be stored in a dictionary (JSON) where the data entities get the keys data_entity1, data_entity2, data_entity3, ... based on the number of entities in your PDF.
The header details are stored in the JSON as key:value, and I assume that each key will be present in the header only once.
CODE
Here is the code, which gives you the information from the PDF in the form of JSON. In the output, the first few fields contain information from the header part; the subsequent data entities can be found as data_entity1 and data_entity2.
In the below code all you need to change is pdf_path.
import pdfplumber
import re

# regex pattern for keys in line1 of data entity
my_regex_dict_line1 = {
    'Our Ref' : r'Our Ref :(.*?)Name',
    'Name' : r'Name:(.*?)Ref 1',
    'Ref 1' : r'Ref 1 :(.*?)Ref 2',
    'Ref 2' : r'Ref 2:(.*?)$'
}

# regex pattern for keys in line2 of data entity
my_regex_dict_line2 = {
    'Amount' : r'Amount:(.*?)Total Paid',
    'Total Paid' : r'Total Paid:(.*?)Balance',
    'Balance' : r'Balance:(.*?)Date of A/C',
    'Date of A/C' : r'Date of A/C:(.*?)Date Received',
    'Date Received' : r'Date Received:(.*?)$'
}

# regex pattern for keys in line3 of data entity
my_regex_dict_line3 = {
    'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
    'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
    'A/C Status': r'A/C\s+Status:(.*?)Collector',
    'Collector' : r'Collector :(.*?)$'
}

def preprocess_data(data):
    return [el.strip() for el in data.splitlines() if el.strip()]

def get_header_data(text, json_data={}):
    header_data_list = preprocess_data(text)
    # third line in text of header contains Date Created field
    json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
    # fourth line in text contains Number of Pages
    json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
    # fifth line in text contains Client Code and ClientName
    json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
    json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()

def iterate_through_regex_and_populate_dictionaries(data_dict, regex_dict, text):
    ''' For the given regex_dict, iterate through each regex pattern and add the matched key/value to the data_dict dictionary '''
    for key, regex in regex_dict.items():
        matched_value = re.search(regex, text)
        if matched_value is not None:
            data_dict[key] = matched_value.group(1).strip()

def populate_date_notes(data_dict, text):
    ''' Populate Date and Notes from the data chunk, as lists, into the data_dict dictionary '''
    data_dict['Date'] = []
    data_dict['Notes'] = []
    i = 4
    while i < len(text):
        date_match = re.search(r'(\d{2}/\d{2}/\d{4})', text[i])
        data_dict['Date'].append(date_match.group(1).strip())
        notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$', text[i])
        data_dict['Notes'].append(notes_match.group(1).strip())
        i += 1

data_index = 1
json_data = {}
pdf_path = r'C:\Users\hpoddar\Desktop\Temp\sample3.pdf' # ENTER YOUR PDF PATH HERE
pdf_text = ''
data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'

if __name__ == '__main__':
    with pdfplumber.open(pdf_path) as pdf:
        index = 0
        while index < len(pdf.pages):
            page = pdf.pages[index]
            pdf_text += '\n' + page.extract_text()
            index += 1
    split_on_data_entity = re.split(data_entity_sep_pattern, pdf_text.strip())
    # first element of the split_on_data_entity list contains the header information
    get_header_data(split_on_data_entity[0], json_data)
    while data_index < len(split_on_data_entity):
        data_entity = {}
        data_processed = preprocess_data(split_on_data_entity[data_index])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
        iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
        if len(data_processed) > 3 and 'Date' in data_processed[3] and 'Notes' in data_processed[3]:
            populate_date_notes(data_entity, data_processed)
        json_data['data_entity' + str(data_index)] = data_entity
        data_index += 1
    print(json_data)
Output :
Result string :
{'Date Created': '18/04/2022', 'Number of Pages': '4', 'Client Code': '110203', 'ClientName': 'AWS PTE. LTD.', 'data_entity1': {'Our Ref': '2118881115', 'Name': 'Sky Blue', 'Ref 1': '12-34-56789-2021/2', 'Ref 2': 'F2021004444', 'Amount': '$100.11', 'Total Paid': '$0.00', 'Balance': '$100.11', 'Date of A/C': '01/08/2021', 'Date Received': '10/12/2021', 'Last Paid': '', 'Amt Last Paid': '', 'A/C Status': 'CLOSED', 'Collector': 'Sunny Jane', 'Date': ['04/03/2022'], 'Notes': ['Letter Dated 04 Mar 2022.']}, 'data_entity2': {'Our Ref': '2112221119', 'Name': 'Green Field', 'Ref 1': '98-76-54321-2021/1', 'Ref 2': 'F2021001111', 'Amount': '$233.88', 'Total Paid': '$0.00', 'Balance': '$233.88', 'Date of A/C': '01/08/2021', 'Date Received': '10/12/2021', 'Last Paid': '', 'Amt Last Paid': '', 'A/C Status': 'CURRENT', 'Collector': 'Sam Jason', 'Date': ['11/03/2022', '11/03/2022', '08/03/2022', '08/03/2022', '21/02/2022', '18/02/2022', '18/02/2022'], 'Notes': ['Email for payment', 'Case Status', 'to send a Letter', '845***Ringing, No reply', 'Letter printed - LET: LETTER 2', 'Letter sent - LET: LETTER 2', '845***Line busy']}}
Now once you have the data in JSON format, you can load it into a CSV file, a data frame, or whatever format you need the data to be in.
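For example, a quick sketch using pandas (my own addition, not part of the code above) that flattens json_data into one row per data entity and writes a CSV:

import pandas as pd

# One row per data entity, with the header fields (Date Created,
# Client Code, ...) repeated on every row for context.
header = {k: v for k, v in json_data.items() if not k.startswith('data_entity')}
rows = [{**header, **entity}
        for key, entity in json_data.items() if key.startswith('data_entity')]
df = pd.DataFrame(rows)
# 'Date' and 'Notes' are lists, so they land in the CSV as their string
# representation; explode or join them first if you need one note per row.
df.to_csv('entities.csv', index=False)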
Save as xlsx
To save the same data to an xlsx file, in the format shown in the image in the question above, we can use XlsxWriter.
Please install the package using pip
pip install xlsxwriter
From the previous code we have our entire data in the variable json_data; we iterate through all the data entities and write each value to the appropriate cell, addressed by row, col in the code.
import xlsxwriter

workbook = xlsxwriter.Workbook('Sample.xlsx')
worksheet = workbook.add_worksheet("Sheet 1")

row = 0
col = 0

# write columns
columns = (['Account History Report', 'All Notes']
           + [key for key in json_data.keys() if 'data_entity' not in key]
           + list(json_data['data_entity1'].keys()))
worksheet.write_row(row, col, tuple(columns))
row += 1

column_index_map = {}
for index, col_name in enumerate(columns):
    column_index_map[col_name] = index

# write the header
worksheet.write(row, column_index_map['Date Created'], json_data['Date Created'])
worksheet.write(row, column_index_map['Number of Pages'], json_data['Number of Pages'])
worksheet.write(row, column_index_map['Client Code'], json_data['Client Code'])
worksheet.write(row, column_index_map['ClientName'], json_data['ClientName'])

data_entity_index = 1

# iterate through each data entity and, for each key, insert the values in the sheet
while True:
    data_entity_key = 'data_entity' + str(data_entity_index)
    row_size = 1
    if json_data.get(data_entity_key) is not None:
        for key, value in json_data.get(data_entity_key).items():
            if isinstance(value, list):
                worksheet.write_column(row, column_index_map[key], tuple(value))
                row_size = len(value)
            else:
                worksheet.write(row, column_index_map[key], value)
    else:
        break
    data_entity_index += 1
    row += row_size

workbook.close()
Result :
The above code creates the file Sample.xlsx in the working directory.
I have many txt files with the standard format in the same folder:
Name: 321;
Score:100; Used Time: 1:09:308;
GTime: 6/28/2024 10:04:18 PM;
Core Version : 21.0.0.0;
Software Version : 21.0.0.0;
AppID: 0S0; MapDispName: Future City; MapName:MapName123;
Key:A0000-abcde-Q0000-F0000-00H00; REG Date : 2/27/2021 1:16:34 PM; Expiry : 7/7/2024 12:00:00 AM
I would like to convert those text files into an Excel .xls (table) using a Python script, saving the text filenames as well. The desired output is:
Team ID, Result, Used Time,Software Ver, Core Ver, AppID, Key, REG Date, Expiry,MapName,TXTName
321,100,69.308s,21.0.0.0,21.0.0.0,0S0,A0000-abcde-Q0000-F0000-00H00,2/27/2021 1:16:34 PM,7/7/2024 12:00:00 AM,MapName123,TXTName1
Part of my code is below, but it's not working: TypeError: unsupported operand type(s) for +: 'dict' and 'str' in the penultimate line.
list_drr = []
xls_name = None
for path, file_dir, files in os.walk(file_name_path):
    for file_name in files:
        list_drr.append(os.path.join(path, file_name))
    for dir in file_dir:
        list_drr.append(os.path.join(path, dir))

excel_data = ExcelData(xls_name, "")
excel_datas = excel_data.readExcel()
print(excel_datas)

excel_header = ['Team ID', 'Result', 'Used Time', 'Software Ver', 'Core Version', 'AppID', 'Key', 'REG Date', 'Expiry', 'MapName', 'TXTName']
file = WFile(excel_header)
for drr in list_drr:
    file_datas = getFileData(drr)
    file_datas = file_datas[:7]
    data_list = []
    for data in file_datas:
        lis = data.split(";")
        for li in lis:
            data_list.append(li)
    data_dic = {}
    for data in data_list:
        pos = data.find(":")
        ddq = data[:pos].strip()
        data_dic[ddq] = data[pos+1:].strip()
    file.write((data_dic) + (os.path.basename(file_name)))  # TypeError raised here
file.save("excel.xls")
Please advise what should I do, thanks.
Updated wri_file.py as below.
import xlwt

class WFile():
    def __init__(self, head_name):
        super().__init__()
        self.head_name = head_name
        self.index = 0
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.worksheet = self.workbook.add_sheet('Sheet1')
        self.line = len(self.head_name)
        for i in range(self.line):
            self.worksheet.write(self.index, i, self.head_name[i])
        self.index += 1

    def write(self, d):
        for i in range(self.line):
            name = d.get(self.head_name[i])
            self.worksheet.write(self.index, i, name)
        self.index += 1

    def writes(self, d):
        for i in range(self.line):
            self.worksheet.write(self.index, i, d[i])
        self.index += 1

    def save(self, name):
        self.workbook.save(name)
You are trying to add file_name to the dictionary as if it were another entry to be written. As you have constructed a dictionary, a better approach is to add a new key/value pair before writing:
data_dic['TXTName'] = os.path.basename(drr)
file.write(data_dic)
As your output header is not consistent with the keys in your text files, you need a dictionary to map between the two, e.g. field_mapping.
You were also storing directory names in your list of files. A simpler approach is to use glob.glob(), which can work recursively and can be restricted to just .txt files if needed.
import glob
import os
import xlwt

# {Your header : Key in text file}
field_mapping = {
    'Team ID' : 'Name',
    'Result' : 'Score',
    'Used Time' : 'Used Time',
    'Core Version' : 'Core Version',
    'Software Ver' : 'Software Version',
    'MapName' : 'MapName',
    'AppID' : 'AppID',
    'Key' : 'Key',
    'REG Date' : 'REG Date',
    'Expiry' : 'Expiry',
    'TXTName' : 'TXTName',
}

def getFileData(drr):
    with open(drr) as f_input:
        return f_input.readlines()

class WFile():
    def __init__(self, head_name):
        super().__init__()
        self.head_name = head_name
        self.index = 0
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.worksheet = self.workbook.add_sheet('Sheet1')
        self.line = len(self.head_name)
        for i in range(self.line):
            self.worksheet.write(self.index, i, self.head_name[i])
        self.index += 1

    def write(self, d):
        for i, v in enumerate(self.head_name):
            name = d.get(field_mapping[v])
            self.worksheet.write(self.index, i, name)
        self.index += 1

    def save(self, name):
        self.workbook.save(name)

file_name_path = r"\my file\location\*.txt" # or just * for all files
excel_header = ['Team ID', 'Result', 'Used Time', 'Software Ver', 'Core Version', 'AppID', 'Key', 'REG Date', 'Expiry', 'MapName', 'TXTName']
file = WFile(excel_header)

for file_name in glob.glob(file_name_path, recursive=True):
    file_datas = getFileData(file_name)
    file_datas = file_datas[:7]
    data_list = []
    for data in file_datas:
        for li in data.split(";"):
            data_list.append(li)
    data_dic = {}
    for data in data_list:
        pos = data.find(":")
        ddq = data[:pos].strip()
        data_dic[ddq] = data[pos+1:].strip()
    data_dic['TXTName'] = os.path.basename(file_name)
    file.write(data_dic)

file.save("excel.xls")
Giving you an old-style .xls output file of:
You should consider using the newer .xlsx format, with a library such as openpyxl.
import openpyxl
from itertools import islice
import glob
import os

# {Key in text file : Your header}
field_mapping = {
    'Name' : 'Team ID',
    'Score' : 'Result',
    'Used Time' : 'Used Time',
    'Core Version' : 'Core Ver',
    'Software Version' : 'Software Ver',
    'AppID' : 'AppID',
    'Key' : 'Key',
    'REG Date' : 'REG Date',
    'Expiry' : 'Expiry',
    'TXTName' : 'TXTName'
}

wb = openpyxl.Workbook()
ws = wb.active
ws.append(list(field_mapping.values()))    # write a header

for filename in glob.glob('/my folder/*.txt', recursive=True):
    key_values = {}
    with open(filename) as f_input:
        for field in ''.join(islice(f_input, 0, 7)).replace('\n', '').split(';'):
            key, value = map(str.strip, field.split(':', 1))    # only split on the first colon
            key_values[field_mapping.get(key, None)] = value    # unmapped keys all collapse onto None
    key_values['TXTName'] = os.path.basename(filename)
    output_row = [value for key, value in key_values.items() if key]    # ignore key=None
    ws.append(output_row)

wb.save('excel.xlsx')
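Note that the desired output row converts Used Time from 1:09:308 to 69.308s, which neither snippet above does. A small helper, assuming the format is minutes:seconds:milliseconds (my reading of the sample, so treat this as a sketch):

def used_time_to_seconds(t):
    # '1:09:308' -> '69.308s' (assumed minutes:seconds:milliseconds)
    minutes, seconds, millis = t.split(':')
    total = int(minutes) * 60 + int(seconds) + int(millis) / 1000
    return '{:.3f}s'.format(total)

# e.g. key_values['Used Time'] = used_time_to_seconds(key_values['Used Time'])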
From an input.txt file, I would like to create two dictionaries.
For example, here is a sample of the input.txt file:
#. VAR #first=Billy
#. VAR #last=Bob
#. PRINT VARS
#. VAR #petName=Gato
#. VAR #street="1234 Home Street"
#. VAR #city="New York"
#. VAR #state=NY
#. VAR #zip=21236
#. VAR #title=Dr.
#. PRINT VARS
#. FORMAT LM=5 JUST=LEFT
#. PRINT FORMAT
So the pattern is VAR #varName=value;
i.e. in the case of #first=Billy you would get something like varDict = {"first": "Billy"}, right?
Now I want to know how to do that through the entire file.
There are two dictionaries that I would need to populate: one for the variables, and one for FORMAT, which just holds values and doesn't actually do anything for now.
As for the desired output: the input file contains commands that, when read, trigger either adding variables to the dictionary, printing that dictionary, or adding to the format dictionary. I would use the pprint function like this, pprint.pprint(varDict, width=30), and it would output something like this:
{'first': 'Billy',
'last': 'Bob'}
{'city': 'New York',
'first': 'Billy',
'last': 'Bob',
'petName': 'Gato',
'state': 'NY',
'street': '1234 Home Street',
'title': 'Dr.',
'zip': '21236'}
{'BULLET': 'o',
'FLOW': 'YES',
'JUST': 'LEFT',
'LM': '5',
'RM': '80'}
Unfortunately I keep getting errors all over the place in the driver and source file:
AttributeError: 'list' object has no attribute 'groups'
TypeError: expected string or buffer
Driver.py
import re
import sys
import pprint
from importFileIUse import setVariable, setFormat

input = sys.argv[1]

# Group 1. VAR
# Group 2. #first=Mae or JUST=RIGHT FLOW=NO
# pass Group 2 as atString
regexSearch = re.compile(r'^#. ([A-Z]+) (.*)', re.MULTILINE)
regexPrintVAR = re.compile(r'^#\.\s*PRINT\s(VARS)', re.MULTILINE)
regexPrintFORMAT = re.compile(r'^#\.\s*PRINT\s(FORMAT)', re.MULTILINE)
regexERRCheck = re.compile(r'^#\.\s*FORMAT\s+BAD', re.MULTILINE)

varDictionary = dict()
formatDictionary = {"FLOW": "YES", "LM": "1", "RM": "80", "JUST": "LEFT", "BULLET": "o"}

file = open(input, "r")
while True:
    inputLine = file.readline()
    matchObj = regexSearch.search(inputLine)
    # matchObj is None for blank/non-matching lines and at EOF,
    # so the next line raises an AttributeError
    command, atString = matchObj.groups()
    if command == "VAR":
        setVariable(atString, varDictionary)
    if command == "FORMAT":
        formatListERR = regexERRCheck.search(inputLine)
        if formatListERR != None:
            print("*** Not a recognizable command")
            line = file.readline()
        setFormat(atString, formatDictionary)
    if command == "PRINT":
        printVARObj = regexPrintVAR.search(inputLine)
        printFormatObj = regexPrintFORMAT.search(inputLine)
        if printVARObj != None:
            pprint.pprint(varDictionary, width=30)
        elif printFormatObj != None:
            pprint.pprint(formatDict, width=30)  # NameError: should be formatDictionary
    inputLine = file.readline()
file.close()
importFileIUse.py
import re

# The atString is the remainder of the string after the VAR or FORMAT key word.
varDictionary = dict()
formatDictionary = {"FLOW": "YES", "LM": "1", "RM": "80", "JUST": "LEFT", "BULLET": "o"}

def setFormat(atString, formatDictionary):
    regexFormat = re.compile(r'((?:(?:\w+)=(?:\w+)\s*)*)$')
    line = re.split(" +", atString)
    # line is a list, but search() expects a string -- this is the
    # source of "TypeError: expected string or buffer"
    formatList = regexFormat.search(line)
    if formatList:
        for param in formatList[0].split():
            splitParam = param.split('=')
            formatDictionary[splitParam[0]] = splitParam[1]

def setVariable(atString, varDictionary):
    regexVAR = re.compile(r'#(\w+)=(\w+|.*)\s*$', re.MULTILINE)
    # file = open(input)
    # line = file.readline()
    # line = re.split(" +", atString)
    #while line:
    varList = regexVAR.findall(atString)
    for key, value in varList:
        varDictionary[key] = value
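A minimal sketch of the dispatch loop described above (my own simplification, so adapt as needed): it reads the file once, skips lines that don't match the command pattern (one source of the errors above), and fills both dictionaries.

import re
import sys
import pprint

varDictionary = {}
formatDictionary = {"FLOW": "YES", "LM": "1", "RM": "80", "JUST": "LEFT", "BULLET": "o"}

# '#. COMMAND rest-of-line'
lineRegex = re.compile(r'^#\.\s*([A-Z]+)\s+(.*)$')

with open(sys.argv[1]) as fh:
    for inputLine in fh:
        matchObj = lineRegex.search(inputLine)
        if matchObj is None:
            continue                     # not a '#.' command line
        command, atString = matchObj.groups()
        if command == 'VAR':
            # '#name=value' -- strip the leading '#' and any quotes around the value
            name, _, value = atString.lstrip('#').partition('=')
            varDictionary[name.strip()] = value.strip().strip('"')
        elif command == 'FORMAT':
            # any number of KEY=value pairs separated by whitespace
            for param in atString.split():
                key, _, value = param.partition('=')
                formatDictionary[key] = value
        elif command == 'PRINT':
            if atString.startswith('VARS'):
                pprint.pprint(varDictionary, width=30)
            elif atString.startswith('FORMAT'):
                pprint.pprint(formatDictionary, width=30)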
I'm trying to build a csv file that has all the Active Directory fields I need, from 2 external files.
The first file has a list of users that need to be created and some other info relevant to an AD object, and the second report is a list of exported SamAccountNames and emails dumped from AD. So what I want to do is create a unique SamAccountName: I form my test SamAccountName from the first name and last name in the first report and want to compare it against the second report. I'm currently storing in a list all the data I get from the second report, and I want to check whether my generated SamAccountName exists in that list.
So far I'm not able to do so, and I only get a csv with the SamAccountNames I made up (it does not do the check).
Note: I can't use any plugin to check directly against Active Directory.
import csv

def getSamA(fname, lname):
    Sams = []
    sama = lname[0:5] + fname[0:2]
    with open('test-input.txt', 'r') as AD:
        rows = csv.DictReader(AD)
        for ad in rows:
            Sams.append(ad['SamAccountName'])
    # check if built sama is in list
    if sama in Sams:
        # if sama in list, add one more character to sama
        sama = lname[0:5] + fname[0:3]
        return sama.lower()
    else:
        return sama.lower()
with open('users.csv') as csv_file:
    rows = csv.DictReader(csv_file)
    with open('users-COI2-Multi.csv', 'w', newline='') as output:
        header = ['FirstName', 'Initials', 'LastName', 'DisplayName', 'Description', 'Office', 'TelePhone', 'UserLogonName', 'SamAccountName', 'JobTitle', 'Department', 'Manager', 'Mobile', 'faxNumber', 'Notes', 'Assistant', 'employeeID', 'ex1', 'ex2', 'ex3', 'ex15', 'Office365License', 'ExpiresInDays', 'EmailToUSer', 'AddToGroup']
        output_file = csv.DictWriter(output, fieldnames=header, delimiter=';')
        output_file.writeheader()
        # getInitials, getDisplayName, getOffice, getMail, getSiteCode and
        # getSKID are helper functions defined elsewhere in my script
        for data in rows:
            employeeId = data['Associate ID']
            fName = data['First Name']
            lName = data['Last Name']
            Location = data['Location']
            Department = data['Department']
            Manager = data['Manager Name']
            JobTitle = data['Title']
            context = {
                'FirstName' : fName,
                'Initials' : getInitials(fName, lName),
                'LastName' : lName,
                'DisplayName' : getDisplayName(fName, lName),
                'Description' : 'Account for: ' + getDisplayName(fName, lName),
                'Office' : getOffice(Location).strip(),
                'TelePhone' : '+1 XXX XXX XXXX',
                'UserLogonName' : getMail(fName, lName),
                'SamAccountName' : getSamA(fName, lName),
                'JobTitle' : JobTitle,
                'Department' : Department,
                'Manager' : Manager,
                'Mobile' : '',
                'faxNumber' : '',
                'Notes' : '',
                'Assistant' : '',
                'employeeID' : employeeId,
                'ex1' : 'End User',
                'ex2' : 'NoMailbox',
                'ex3' : getSiteCode(Location),
                'ex15' : getSKID(Location),
                'Office365License' : '',
                'ExpiresInDays' : '',
                'EmailToUSer' : 'user#test.com',
                'AddToGroup' : '',
            }
            output_file.writerow(context)
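A sketch of the missing uniqueness check: read the AD export once into a set, compare case-insensitively, and lengthen the candidate until it is unused. test-input.txt and the SamAccountName column come from the code above; the growing loop is my own variant of the "add one more character" idea.

import csv

def load_existing_samas(path='test-input.txt'):
    # Read the AD export once, lower-cased for case-insensitive comparison.
    with open(path, newline='') as ad:
        return {row['SamAccountName'].strip().lower() for row in csv.DictReader(ad)}

existing = load_existing_samas()

def getSamA(fname, lname):
    # Start with 5 chars of last name + 2 of first name, then grow the
    # first-name part one character at a time until the name is unused.
    for extra in range(2, len(fname) + 1):
        sama = (lname[0:5] + fname[0:extra]).lower()
        if sama not in existing:
            existing.add(sama)  # reserve it so later rows don't collide
            return sama
    return (lname[0:5] + fname).lower()  # every candidate taken; last resort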
; commentary
[owner]
name=Justin Case
organization=Chilling Inc.
[database]
; more commentary
server=192.0.0.1
port=123
file=something.csv
[third section]
attribute=value,
that extends to
the third line,
but not the fourth
Given the above ini contents, I have to construct a dictionary such that:
{'owner' : {'name' : 'Justin Case','organization' : 'Chilling Inc.'},
'database' : {'server' : '192.0.0.1', 'port' : '123', 'file' : 'something.csv'},
'third section' : {'attribute' : 'multiline value'}}
I realize there is the configparser module, but it's not allowed for this assignment.
Progress at the moment:
with open('ini.txt', encoding='utf8') as data:
    lines = [row for row in data]

lines_nocom = []
for row in lines:
    if not row.startswith(';'):
        lines_nocom.append(row)

dictt = {}
I removed the rows with commentary in them since they are unnecessary.
How can I make Python recognize the sections and their respective attributes? I.e., section1 could have 2 attributes and section2 could have any number of attributes.
If I do [row for row in lines_nocom], then how does it recognize where one section ends and another begins?
And how do I make Python recognize a multiline value?
Track the current section and add your keys to that; each time you find a line using square brackets create a new section.
For continuation lines, do something similar; track the last used name:
with open('ini.txt', encoding='utf8') as data:
    section = None  # current section
    name = None     # current name being stored
    result = {}
    for line in data:
        line = line.strip()
        if not line or line.startswith(';'):
            # skip comments and empty lines
            continue
        if line.startswith('[') and line.endswith(']'):
            # new section
            section_name = line.strip('[]')
            section = result[section_name] = {}
            continue
        # add entries to the existing section
        if '=' in line:
            name, _, value = line.partition('=')
            name = name.strip()
            section[name] = value.strip()
        else:
            # adding to last-used name
            section[name] += ' ' + line
Demo:
>>> from io import StringIO
>>> from pprint import pprint
>>> sample = StringIO('''\
... ; commentary
... [owner]
... name=Justin Case
... organization=Chilling Inc.
...
... [database]
... ; more commentary
... server=192.0.0.1
... port=123
... file=something.csv
...
... [third section]
... attribute=value,
... that extends to
... the third line,
... but not the fourth
... ''')
>>> section = None # current section
>>> name = None # current name being stored
>>> result = {}
>>> for line in sample:
... line = line.strip()
... if not line or line.startswith(';'):
... # skip comments and empty lines
... continue
... if line.startswith('[') and line.endswith(']'):
... # new section
... section_name = line.strip('[]')
... section = result[section_name] = {}
... continue
... # add entries to the existing section
... if '=' in line:
... name, _, value = line.partition('=')
... name = name.strip()
... section[name] = value.strip()
... else:
... # adding to last-used name
... section[name] += ' ' + line
...
>>> pprint(result)
{'database': {'file': 'something.csv', 'port': '123', 'server': '192.0.0.1'},
'owner': {'name': 'Justin Case', 'organization': 'Chilling Inc.'},
'third section': {'attribute': 'value, that extends to the third line, but '
'not the fourth'}}
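For comparison only, since configparser is off-limits for the assignment: the standard library builds the same nested structure, assuming the continuation lines are indented in the real file (configparser requires that), with the caveat that multiline values come back newline-joined rather than space-joined.

import configparser

parser = configparser.ConfigParser()
parser.read('ini.txt')
# flatten the parser's sections into plain nested dicts
result = {name: dict(parser[name]) for name in parser.sections()}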