convert contents of metadata file into variables list - python

Hi, I'm wanting to convert the contents of a file (in this case a Landsat 7 metadata file) into a series of variables defined by the contents of the file, using Python 2.7. The file contents look like this:
GROUP = L1_METADATA_FILE
GROUP = METADATA_FILE_INFO
ORIGIN = "Image courtesy of the U.S. Geological Survey"
REQUEST_ID = "0101305309253_00043"
LANDSAT_SCENE_ID = "LE71460402010069SGS00"
FILE_DATE = 2013-06-02T11:19:59Z
STATION_ID = "SGS"
PROCESSING_SOFTWARE_VERSION = "LPGS_12.2.1"
DATA_CATEGORY = "NOMINAL"
END_GROUP = METADATA_FILE_INFO
GROUP = PRODUCT_METADATA
DATA_TYPE = "L1T"
ELEVATION_SOURCE = "GLS2000"
OUTPUT_FORMAT = "GEOTIFF"
EPHEMERIS_TYPE = "DEFINITIVE"
SPACECRAFT_ID = "LANDSAT_7"
SENSOR_ID = "ETM"
SENSOR_MODE = "BUMPER"
WRS_PATH = 146
WRS_ROW = 040
DATE_ACQUIRED = 2010-03-10
GROUP = IMAGE_ATTRIBUTES
CLOUD_COVER = 0.00
IMAGE_QUALITY = 9
SUN_AZIMUTH = 137.38394502
SUN_ELEVATION = 48.01114126
GROUND_CONTROL_POINTS_MODEL = 55
GEOMETRIC_RMSE_MODEL = 3.790
GEOMETRIC_RMSE_MODEL_Y = 2.776
GEOMETRIC_RMSE_MODEL_X = 2.580
END_GROUP = IMAGE_ATTRIBUTES
An example of the variables I'm interested in:
GROUP = MIN_MAX_RADIANCE
RADIANCE_MAXIMUM_BAND_1 = 293.700
RADIANCE_MINIMUM_BAND_1 = -6.200
RADIANCE_MAXIMUM_BAND_2 = 300.900
RADIANCE_MINIMUM_BAND_2 = -6.400
RADIANCE_MAXIMUM_BAND_3 = 234.400
RADIANCE_MINIMUM_BAND_3 = -5.000
RADIANCE_MAXIMUM_BAND_4 = 241.100
RADIANCE_MINIMUM_BAND_4 = -5.100
RADIANCE_MAXIMUM_BAND_5 = 47.570
RADIANCE_MINIMUM_BAND_5 = -1.000
RADIANCE_MAXIMUM_BAND_6_VCID_1 = 17.040
RADIANCE_MINIMUM_BAND_6_VCID_1 = 0.000
RADIANCE_MAXIMUM_BAND_6_VCID_2 = 12.650
RADIANCE_MINIMUM_BAND_6_VCID_2 = 3.200
RADIANCE_MAXIMUM_BAND_7 = 16.540
RADIANCE_MINIMUM_BAND_7 = -0.350
RADIANCE_MAXIMUM_BAND_8 = 243.100
RADIANCE_MINIMUM_BAND_8 = -4.700
END_GROUP = MIN_MAX_RADIANCE
I am open to other ideas, as I don't need all entries as variables, just a selection. I also see some headers are listed more than once; i.e. GROUP is used multiple times. I need to be able to select certain variables (numeric values) and use them in formulas elsewhere in the code. Any help would be appreciated (novice Python coder).

I'm not sure exactly what you are looking for, but maybe something like this:
s = '''GROUP = L1_METADATA_FILE
GROUP = METADATA_FILE_INFO
ORIGIN = "Image courtesy of the U.S. Geological Survey"
REQUEST_ID = "0101305309253_00043"
LANDSAT_SCENE_ID = "LE71460402010069SGS00"
FILE_DATE = 2013-06-02T11:19:59Z
STATION_ID = "SGS"
PROCESSING_SOFTWARE_VERSION = "LPGS_12.2.1"
DATA_CATEGORY = "NOMINAL"
END_GROUP = METADATA_FILE_INFO
GROUP = PRODUCT_METADATA
DATA_TYPE = "L1T"
ELEVATION_SOURCE = "GLS2000"
OUTPUT_FORMAT = "GEOTIFF"
EPHEMERIS_TYPE = "DEFINITIVE"
SPACECRAFT_ID = "LANDSAT_7"
SENSOR_ID = "ETM"
SENSOR_MODE = "BUMPER"
WRS_PATH = 146
WRS_ROW = 040
DATE_ACQUIRED = 2010-03-10'''
output = {}  # dict to hold the key/value pairs
for line in s.split("\n"):  # iterate over every line in the string
    l = line.split("=")  # separate by "=" and put into a list
    output[l[0].strip()] = l[1].strip()  # first word is the key, second is the value

print output  # a dictionary of all key-value pairs in your metadata, separated by "="
print output["SENSOR_ID"]  # outputs "ETM"
==============
Edited:
f = open('metadata.txt', 'r')  # open file for reading

def build_data(f):  # build dictionary
    output = {}
    for line in f.readlines():  # iterate over every line in the file
        if "=" in line:  # only parse lines that hold data
            l = line.split("=")  # separate by "=" and put into a list
            output[l[0].strip()] = l[1].strip()  # first word is the key, second is the value
    return output  # returns a dictionary of the key/value pairs

data = build_data(f)
print data["IMAGE_QUALITY"]  # prints 9
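The values come back as strings, so cast the ones you want to use in formulas to numbers first. A small follow-up sketch (key names taken from the sample above, metadata.txt assumed):
data = build_data(open('metadata.txt', 'r'))

lmax = float(data["RADIANCE_MAXIMUM_BAND_1"])  # 293.700
lmin = float(data["RADIANCE_MINIMUM_BAND_1"])  # -6.200
sun_elevation = float(data["SUN_ELEVATION"])   # 48.01114126

print lmax - lmin  # plain numbers now, ready to use in formulas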

Related

pylucene cannot find a word that is present in the text that was indexed earlier

I use pylucene 9.4.1 to index a document and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document, but pylucene is unable to find them in the index.
This is my code to index the document:
(The document can be downloaded from here.)
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'

def indexDocument(title, year, plot):
    ft = FieldType()
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    doc = document.Document()
    doc.add(document.Field("Title", title, ft))
    doc.add(document.Field("Plot", plot, ft))
    writer.addDocument(doc)

def CloseWriter():
    writer.close()

def makeInvertedIndex(file_path):
    df = pd.read_csv(file_path)
    print(df.columns)
    docid = 0
    for i in df.index:
        print(docid, '-', df['Title'][i])
        indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
        docid += 1

indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help to understand the problem is appreciated.
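One thing worth checking (an observation from the code shown, not a verified fix): the index is written with EnglishAnalyzer, which stems terms (Porter stemming turns 'baby' into 'babi'), but the query is parsed with StandardAnalyzer, which does not stem, so the query term may never match what is in the index. A minimal sketch of parsing the query with the same analyzer used at index time:
from org.apache.lucene.analysis.en import EnglishAnalyzer

analyzer = EnglishAnalyzer()  # same analyzer as the IndexWriterConfig above
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)  # should now show the stemmed term, e.g. Title:babi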

Appending to list replaces all items in the list with most recent item

So this is what I have. What I get at the end is a list where every element in formattedList is the same. But when I print features[i], it prints the right features that I want to append to the end of the list.
# function to change the modified json list back to a dict so it can be saved as a json file again
def makeDict(jsonlist):
    features = [{}] * 364
    geoid1 = [{}] * 364
    geometry = [{}] * 364
    properties = [{}] * 364
    formattedList = []
    tempList = []
    keylistfeatures = ['geometry', 'properties']
    keylistgeometry = ['coordinates', 'type']
    keylistproperties = ['content', 'datatype', 'density', 'density', 'description', 'display', 'file', 'size',
                         'source', 'target']
    keylistfile = ['date', 'name', 'size']
    file = {}
    size = []
    density = []
    for i in range(0, 364):
        # make density list
        density = [jsonlist[i][6], jsonlist[i][7]]
        # size list
        size = [jsonlist[i][8], jsonlist[i][9]]
        # file dictionary
        file['date'] = jsonlist[i][1]
        file['name'] = jsonlist[i][0]
        file['size'] = jsonlist[i][2]
        # geometry dictionary
        geometry[i]['coordinates'] = jsonlist[i][11]
        geometry[i]['type'] = jsonlist[i][12]
        # properties dictionary
        properties[i]['content'] = jsonlist[i][5]
        properties[i]['datatype'] = jsonlist[i][4]
        properties[i]['density'] = density
        properties[i]['description'] = jsonlist[i][10]
        properties[i]['display'] = jsonlist[i][3]
        properties[i]['file'] = file
        properties[i]['size'] = size
        properties[i]['source'] = jsonlist[i][14]
        properties[i]['target'] = jsonlist[i][15]
        features[i]['geometry'] = geometry[i]
        features[i]['properties'] = properties[i]
        features[i]['type'] = jsonlist[i][13]
        # print(features[i])
        formattedList.append(features[i])
    # print(formattedList)
    return formattedList
Been stuck on this for hours but my guess is it's something simple I don't understand.
When I use the debugger, the 1st through 4th iterations all show the same values (debugger screenshots omitted). And features is always a list of 364 copies of the same thing for that iteration (screenshot omitted).
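For what it's worth, a likely culprit in code like this is that [{}] * 364 builds a list of 364 references to one single dict, so features[i], geometry[i] and properties[i] all point at the same objects (and file is one dict mutated on every pass). A minimal sketch of the difference, independent of the question's data:
same = [{}] * 3                  # three references to ONE dict
same[0]['k'] = 'v'
print(same)                      # [{'k': 'v'}, {'k': 'v'}, {'k': 'v'}]

fresh = [{} for _ in range(3)]   # three independent dicts
fresh[0]['k'] = 'v'
print(fresh)                     # [{'k': 'v'}, {}, {}]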

Parse a Generated File in Python

I'm trying to parse generated files into a list of objects.
Unfortunately the structure of the generated files is not always the same, but they contain the same fields (and lots of other garbage).
For example:
function foo(); # Don't Care
function maybeanotherfoo(); # Don't Care
int maybemoregarbage; # Don't Care
product_serial = "CDE1102"; # I want this <---------------------
unnecessary_info1 = 10; # Don't Care
unnecessary_info2 = "red" # Don't Care
product_id = 1134412; # I want this <---------------------
unnecessary_info3 = "88" # Don't Care
product_serial = "DD1232"; # I want this <---------------------
product_id = 3345111; # I want this <---------------------
unnecessary_info1 = "22" # Don't Care
unnecessary_info2 = "panda" # Don't Care
product_serial = "CDE1102"; # I want this <---------------------
unnecessary_info1 = 10; # Don't Care
unnecessary_info2 = "red" # Don't Care
unnecessary_info3 = "bear" # Don't Care
unnecessary_info4 = 119 # Don't Care
product_id = 1112331; # I want this <---------------------
unnecessary_info5 = "jj" # Don't Care
I want a list of objects (each object has: serial and id).
I have tried the following:
import re

class Product:
    def __init__(self, id, serial):
        self.product_id = id
        self.product_serial = serial

linenum = 0
first_string = "product_serial"
second_string = "product_id"
with open('products.txt', "r") as products_file:
    for line in products_file:
        linenum += 1
        if line.find(first_string) != -1:
            product_serial = re.search('\"([^"]+)', line).group(1)
            # How do I proceed?
Any advice would be greatly appreciated!
Thanks!
I've inlined the data here using an io.StringIO(), but you can substitute data for your products_file.
The idea is that we gather key/values into current_object, and as soon as we have all the data we know we need for a single object (the two keys), we push it onto a list of objects and prime a new current_object.
You could use something like if line.startswith('product_serial') instead of the admittedly complex regexp.
import io
import re
data = io.StringIO("""
function foo();
function maybeanotherfoo();
int maybemoregarbage;
product_serial = "CDE1102";
unnecessary_info1 = 10;
unnecessary_info2 = "red"
product_id = 1134412;
unnecessary_info3 = "88"
product_serial = "DD1232";
product_id = 3345111;
unnecessary_info1 = "22"
unnecessary_info2 = "panda"
product_serial = "CDE1102";
unnecessary_info1 = 10;
unnecessary_info2 = "red"
unnecessary_info3 = "bear"
unnecessary_info4 = 119
product_id = 1112331;
unnecessary_info5 = "jj"
""")
objects = []
current_object = {}
for line in data:
    line = line.strip()  # Remove leading and trailing whitespace
    m = re.match(r'^(product_id|product_serial)\s*=\s*(\d+|"(?:.+?)");?$', line)
    if m:
        key, value = m.groups()
        current_object[key] = value.strip('"')
        if len(current_object) == 2:  # Got the two keys we want, ship the object
            objects.append(current_object)
            current_object = {}
print(objects)
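If you would rather end up with instances of the Product class from the question than with plain dicts, one more line on top of the code above (assuming that class definition is in scope):
products = [Product(int(o['product_id']), o['product_serial']) for o in objects]
print(products[0].product_serial)  # CDE1102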

How to convert text table to dataframe

I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linked text file and convert it to a CSV file. Right now I am only half successful: I can locate the table and parse it, but I cannot convert the parsed text into a standard table. My code is attached. Can someone help me with it?
import requests
import pandas as pd

url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'

# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'

def find_no_line_start_table(table_title, splited_data):
    found_no_lines = []
    for index, line in enumerate(splited_data):
        if table_title in line:
            found_no_lines.append(index)
    return found_no_lines

table_start = find_no_line_start_table(table_title, splited_data)
# I need help with locating the table. If I locate the table using the above function,
# it returns two locations and I have to choose the correct one manually.
table_start = table_start[1]

def get_start_data_table(table_start, splited_data):
    for index, row in enumerate(splited_data[table_start:]):
        if '<C>' in row:
            return table_start + index

def get_end_table(start_table_data, splited_data):
    for index, row in enumerate(splited_data[start_table_data:]):
        if END_TABLE_LINE in row:
            return start_table_data + index

def row(l):
    l = l.split()
    number_columns = 8
    if len(l) >= number_columns:
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if not first_column_done:
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
            else:
                index += 1
                data_row[index] = w
        return data_row

start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line:end_line]

# I also need help with converting the text table to a CSV file; somehow the
# following function does not recognize my columns.
def take_table(table):
    owner = []
    Num_share = []
    middle = []
    middle_1 = []
    middle_2 = []
    middle_3 = []
    prior_offering = []
    after_offering = []
    for r in table:
        data_row = row(r)
        if data_row:
            col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
            owner.append(col_1)
            Num_share.append(col_2)
            middle.append(col_3)
            middle_1.append(col_4)
            middle_2.append(col_5)
            middle_3.append(col_6)
            prior_offering.append(col_7)
            after_offering.append(col_8)
    table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
                  'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
                  'after_offering': after_offering}
    return table_data

# print(table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)
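For reference, a minimal sketch of from_dict with equal-length lists as columns (a toy dict here, not the question's data):
import pandas as pd

dict_table = {'owner': ['A', 'B'], 'Num_share': ['100', '200']}
df = pd.DataFrame.from_dict(dict_table)
df.to_csv('trail.csv', index=False)  # index=False keeps row numbers out of the CSV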

How to Create a 3 Level List in Python 2.7

How do I create a 3-level list in Python 2.7? (See the linked image of the desired nested lists.)
Setup:
I have the following inputs:
File(s)
Categories
I then search through each file and iterate through each category to get every value. I just can't get the desired output.
# Combine Multiple Lists
LinkDoc = ["File1", "File2", "File3"]
cat = ["Walls", "Doors", "Columns"]
rvt1_wall = ["W1_rvt1", "W2_rvt1"]
rvt1_door = ["D1_rvt1", "D2_rvt1", "D3_rvt1", "D4_rvt1"]
rvt1_col = ["C1_rvt1"]
rvt2_wall = ["W2_rvt2", "W2_rvt2"]
rvt2_door = ["D2_rvt2", "D2_rvt2", "D3_rvt2", "D4_rvt2"]
rvt2_col = ["C2_rvt2"]
rvt3_wall = ["W3_rvt3", "W2_rvt3"]
rvt3_door = ["D3_rvt3", "D2_rvt3", "D3_rvt3", "D4_rvt3"]
rvt3_col = ["C3_rvt3"]
rvt4_wall = ["W4_rvt4", "W2_rvt4"]
rvt4_door = ["D4_rvt4", "D2_rvt4", "D3_rvt4", "D4_rvt4"]
rvt4_col = ["C4_rvt4"]
all_comb1 = [rvt1_wall,rvt1_door,rvt1_col]
all_comb2 = [rvt2_wall,rvt2_door,rvt2_col]
all_comb3 = [rvt3_wall,rvt3_door,rvt3_col]
all_comb4 = [rvt4_wall,rvt4_door,rvt4_col]
all_tot = [all_comb1, all_comb2, all_comb3, all_comb4]
xyz_pairs = [[x,y,z] for x in LinkDoc for y in cat for z in all_tot]
print(xyz_pairs)
print("========")
print("")
