How can I handle a dict key that does not exist? - python

import SimpleITK as sitk

reader = sitk.ImageFileReader()
reader.SetFileName(filePath)
reader.ReadImageInformation()
img = reader.Execute()
meta = {
    "a": reader.GetMetaData('0'),  # <- if the key doesn't exist, return 'undefined'
    "b": reader.GetMetaData('1'),
    "c": reader.GetMetaData('2'),
}
I am a JavaScript developer. I want to build the meta dict, but it raises the error 'Key '0' does not exist'. Since a key may not exist, how can I set meta in this case?

From the docs, the ImageFileReader class has a HasMetaDataKey() boolean function. So you should be able to do something like this:
meta = {
    "a": reader.GetMetaData('0') if reader.HasMetaDataKey('0') else 'undefined',
    "b": reader.GetMetaData('1') if reader.HasMetaDataKey('1') else 'undefined',
    "c": reader.GetMetaData('2') if reader.HasMetaDataKey('2') else 'undefined',
}
And you could do it in one (long) line:
meta = {m: reader.GetMetaData(k) if reader.HasMetaDataKey(k) else 'undefined'
        for m, k in zip(['a', 'b', 'c'], ['0', '1', '2'])}

You can use a defaultdict:
from collections import defaultdict

d = defaultdict(lambda: 'xx')  # <- whatever default value you want
d[10]       # no value passed, so 'xx' is assigned automatically
d[11] = 12  # value 12 assigned explicitly
# note: d.get(key) does NOT trigger the default factory; use d[key]
print(d[10])  # prints 'xx'
print(d)
outputs
defaultdict(<function <lambda> at 0x000001557B4B03A8>, {10: 'xx', 11: 12})
You get the idea; you can modify it according to your needs.
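Note that a defaultdict only helps once the values already live in a dict; the error in the question comes from the reader call itself. A minimal sketch of another common idiom, wrapping GetMetaData in a try/except helper (the helper name and default value are illustrative, and the exception type assumes SimpleITK's usual RuntimeError):
def get_meta(reader, key, default='undefined'):
    try:
        return reader.GetMetaData(key)
    except RuntimeError:  # assumed: SimpleITK raises RuntimeError for a missing key
        return default

meta = {name: get_meta(reader, key)
        for name, key in zip(['a', 'b', 'c'], ['0', '1', '2'])}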

Related

Use beautifulsoup to scrape a table within a webpage?

I am scraping a county website that posts emergency calls and their locations. I have found success webscraping basic elements, but am having trouble scraping the rows of the table.
(Here is an example of what I am working with codewise)
location = list.find('div', class_='listing-search-item__sub-title')
I'm not sure how to specifically web-scrape the rows of the table. Can anyone explain how to dig into the sublevels of HTML to look for these records? I'm not sure if I need to dig into tr, table, tbody, td, etc. I could use some guidance on which division or class to target to dig into the data.
For extracting specific nested elements, I often prefer to use .select, which uses CSS selectors (bs4 doesn't seem to have any support for XPath, but you can also check out these solutions using the lxml library), so for your case you could use something like
soup.select_one('table[id="form1:tableEx1"]').select('tbody tr')
but the results might look a bit weird since the columns might not be separated - to get separated columns/cells, you could get the rows as tuples instead with
tableRows = [
    tuple([c.text.strip() for c in r.find_all(['th', 'td'])])
    for r in BeautifulSoup(tHtml, 'html.parser').select_one(
        'table[id="form1:tableEx1"]'
    ).select('tbody tr')
]
(Note that you can't use the .select('#id') format when the id contains a ":".)
As one of the comments mentioned, you can use pandas.read_html(htmlString) to get a list of tables in the html; if you want a specific table, use the attrs argument:
import pandas
pandas.read_html(htmlString, attrs={'id': 'form1:tableEx1'})[0]
but you will get the whole table - not just what's in tbody; and this will flatten any tables that are nested inside (see the results with the table used in this example).
And the single-statement method I showed at first with select cannot be used at all with nested tables, since the output will be scrambled. Instead, if you want to preserve any nested inner tables without flattening, and if you are likely to be scraping tables often, I have the following set of functions which can be used in general.
First, define two helper functions that the main table extractor depends on:
# get a list of tagNames between a tag and its ancestor
# if a == t.parent: returns []
# if a is None: returns tagNames of ALL ancestors
# if a not in t.parents: returns None
def linkAncestor(t, a=None):
    aList = []
    while t.parent != a or a is None:
        t = t.parent
        if t is None:
            if a is not None:
                aList = None
            break
        aList.append(t.name)
    return aList
def getStrings_table(xSoup):
    # not perfect, but enough for me so far
    tableTags = ['table', 'tr', 'th', 'td']
    return "\n".join([
        c.get_text(' ', strip=True) for c in xSoup.children
        if c.get_text(' ', strip=True) and (c.name is None or (
            c.name not in tableTags and not c.find(tableTags)
        ))
    ])
Then you can define the function for extracting the tables as Python dictionaries:
def tablesFromSoup(mSoup, mode='a', simpleOp=False):
    typeDict = {'t': 'table', 'r': 'row', 'c': 'cell'}
    finderDict = {'t': 'table', 'r': 'tr', 'c': ['th', 'td']}
    refDict = {
        'a': {'tables': 't', 'loose_rows': 'r', 'loose_cells': 'c'},
        't': {'inner_tables': 't', 'rows': 'r', 'loose_cells': 'c'},
        'r': {'inner_tables': 't', 'inner_rows': 'r', 'cells': 'c'},
        'c': {'inner_tables': 't', 'inner_rows': 'r', 'inner_cells': 'c'}
    }
    mode = mode if mode in refDict else 'a'

    # for when simpleOp = True
    nextModes = {'a': 't', 't': 'r', 'r': 'c', 'c': 'a'}
    mainCont = {
        'a': 'tables', 't': 'rows', 'r': 'cells', 'c': 'inner_tables'
    }

    innerContent = {}
    for k in refDict[mode]:
        if simpleOp and k != mainCont[mode]:
            continue
        fdKey = refDict[mode][k]  # also the mode for the recursive call
        innerSoups = [(
            s, linkAncestor(s, mSoup)
        ) for s in mSoup.find_all(finderDict[fdKey])]
        # keep only elements that are not nested inside another table structure
        innerSoups = [(s, la) for s, la in innerSoups if not (
            'table' in la or 'tr' in la or 'td' in la or 'th' in la
        )]
        # recursive call
        kCont = [tablesFromSoup(s, fdKey, simpleOp) for s, la in innerSoups]
        if simpleOp:
            if kCont == [] and mode == 'c':
                break
            return tuple(kCont) if mode == 'r' else kCont
        # if not empty, check if header then add to output
        if kCont:
            if 'row' in k:
                for i in range(len(kCont)):
                    if 'isHeader' in kCont[i]:
                        continue
                    kCont[i]['isHeader'] = 'thead' in innerSoups[i][1]
            if 'cell' in k:
                isH = [(c[0].name == 'th' or 'thead' in c[1]) for c in innerSoups]
                if sum(isH) > 0:
                    if mode == 'r':
                        innerContent['isHeader'] = True
                    else:
                        innerContent[f'isHeader_{k}'] = isH
            innerContent[k] = kCont
    if innerContent == {} and mode == 'c':
        innerContent = mSoup.get_text(' ', strip=True)
    elif mode in typeDict:
        if innerContent == {}:
            innerContent['innerText'] = mSoup.get_text(' ', strip=True)
        else:
            innerStrings = getStrings_table(mSoup)
            if innerStrings:
                innerContent['stringContent'] = innerStrings
        innerContent['type'] = typeDict[mode]
    return innerContent
With the same example as before, this function gives this output; if the simpleOp argument is set to True, it results in a simpler output, but then the headers are no longer differentiated and some other peripheral data is also excluded.
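For reference, a minimal usage sketch (assuming html holds the page markup; the variable names are illustrative):
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
tables = tablesFromSoup(soup)                 # full nested output, headers marked
simple = tablesFromSoup(soup, simpleOp=True)  # flatter output, less detail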

Dynamic list creation and append values - python

I have input data that is parsed from a JSON and printed like this, from keys like tablename, columnname, columnlen:
data = ('tablename', 'abc.xyz'),('tablename','abc.xyz'),('columnname', 'xxx'),('columnname', 'yyy'),('columnlen', 55)
data[0] =
abc.xyz
abc.xyz
abc.xyz
data[1] =
xxx
yyy
zzz
data[2] =
20
30
60
data[0] represents tablename
data[1] represents columnname
data[2] represents column length
I have the code below that creates the empty lists manually:
TableName_list = []
ColumnName_list = []
ColumnLen_list = []
for x in data:
    if x[0] == 'tablename':
        TableName_list.append(x[1])
    elif x[0] == 'columnname':
        ColumnName_list.append(x[1])
    elif x[0] == 'columnlen':
        ColumnLen_list.append(x[1])
I need to create an empty list dynamically for each field (tablename, columnname, columnlen), append the data to it, and collect the lists in a dictionary. My output needs to look like this:
dict = {'TableName':TableName_list,'ColumnName':ColumnName_list,'ColumnLen':columnLength_list }
This is probably most easily done with a defaultdict:
from collections import defaultdict

dd = defaultdict(list)
data = [
    ('tablename', 'abc.xyz'), ('tablename', 'abc.xyz'),
    ('columnname', 'xxx'), ('columnname', 'yyy'),
    ('columnlen', 55), ('columnlen', 30)
]
for d in data:
    dd[d[0]].append(d[1])
Output:
defaultdict(<class 'list'>, {
'tablename': ['abc.xyz', 'abc.xyz'],
'columnname': ['xxx', 'yyy'],
'columnlen': [55, 30]
})
If the case of the names in the result is important, you could use a dictionary to translate the incoming names:
aliases = {'tablename': 'TableName', 'columnname': 'ColumnName', 'columnlen': 'ColumnLen'}
for d in data:
    dd[aliases[d[0]]].append(d[1])
Output:
defaultdict(<class 'list'>, {
'TableName': ['abc.xyz', 'abc.xyz'],
'ColumnName': ['xxx', 'yyy'],
'ColumnLen': [55, 30]
})
I suggest making a dictionary directly, something like this:
out_dict = {}
for x in data:
    key = x[0]
    if key in out_dict:
        out_dict[key].append(x[1])
    else:
        out_dict[key] = [x[1]]
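The same pattern can be written more compactly with dict.setdefault; a small sketch (not from the original answer):
out_dict = {}
for key, value in data:
    # setdefault returns the existing list, or inserts and returns a new empty one
    out_dict.setdefault(key, []).append(value)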
Using pandas:
import pandas as pd

pd.DataFrame(data).groupby(0)[1].apply(list).to_dict()
Output:
{'columnlen': [55, 30],
 'columnname': ['xxx', 'yyy'],
 'tablename': ['abc.xyz', 'abc.xyz']}

Iterate over all values in complex Dictionary

Goal: to add a ~tag to the tail end of any value, in a complex dictionary, with a % of occurrence.
The code works for "shallow" dictionaries (with no sub-dicts). I want it to work with any complex dictionary.
Note: the tag includes ~ when it occurs.
Code:
import re
import random

RE_TAG = re.compile(r".+(~.+)")
DLM = '~'
tag_occurance = 25  # as %

thisdict = {
    "Key1~tag": "foo",
    "Key2": "bar",
    "Key3~tag": {
        "Key3.1": "x",
        "Key3.2~tag": "y"
    }
}

def tag(_str):
    m = RE_TAG.match(_str)
    if m:
        return DLM + m[1][1:]  # '~tag'
    else:
        return ''

# Main Process
thisdict = {key: val + tag(key) if random.randint(0, 100) < tag_occurance else val
            for key, val in thisdict.items()}  # 25% tag
print(thisdict)  # view difference
Error:
val is itself a dictionary, hence the error.
Traceback (most recent call last):
File "./prog.py", line 25, in <module>
File "./prog.py", line 25, in <dictcomp>
TypeError: unsupported operand type(s) for +: 'dict' and 'str'
Desired Output:
{
    "Key1~tag": "foo~tag",  # tag added as a concatenated postfix
    "Key2": "bar",
    "Key3~tag": {
        "Key3.1": "x",
        "Key3.2~tag": "y~tag"  # tag added as a concatenated postfix
    }
}
Cause of Error
thisdict.values() returns the sub-dicts. I'm only interested in their actual sub-values.
print(thisdict.values())
>>> dict_values(['foo', 'bar', {'Key3.1~tag': 'x', 'Key3.2~tag': 'y'}])
Desired iteration:
['foo', 'bar', 'x', 'y']
Please let me know if there is anything else I can add to the post.
One approach, as mentioned in the comments, is to write a recursive function:
def nested_tag(d):
    res = {}
    for key, value in d.items():
        if isinstance(value, dict):
            res[key] = nested_tag(value)
        else:
            res[key] = value + tag(key) if random.randint(0, 100) < tag_occurance else value
    return res

final = nested_tag(thisdict)
print(final)
Output
{'Key1~tag': 'foo', 'Key2': 'bar', 'Key3~tag': {'Key3.1': 'x', 'Key3.2~tag': 'y~tag'}}
The above solution assumes the only complex values are dictionaries.
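If your values can also contain lists, here is a sketch of one way to extend the same recursion (this goes beyond the original answer, which handles dicts only; leaf values are assumed to be strings, as in the question's data):
def nested_tag_any(obj, key=''):
    # recurse into dicts, passing each key down to tag() as before
    if isinstance(obj, dict):
        return {k: nested_tag_any(v, k) for k, v in obj.items()}
    # recurse into lists, reusing the enclosing key
    if isinstance(obj, list):
        return [nested_tag_any(v, key) for v in obj]
    # leaf: assumed to be a string
    return obj + tag(key) if random.randint(0, 100) < tag_occurance else obj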

Python Cerberus how to check dynamic root keys

I have a dict with IDs as its root keys that I want to validate. In other words, the root keys of the dict I want to validate are dynamic. Is there a way to run keyschema against the root keys?
e.g. https://repl.it/#crunk1/cerberusrootkeys
import cerberus
v = cerberus.validator.Validator()
schema = {'keyschema': {'type': 'string'}}
d = {'foo': 'bar', 'baz': 'gaz'}
print('I want this to be true.')
print(v.validate(d, schema))
### Output:
# I want this to be true.
# False
I know I could do the following:
wrapper = {'nested': d}
schema = {'nested': {'keyschema': {'type': 'string'}}}
v.validate(wrapper, schema)
but the current structure of my project doesn't easily allow for that.
Any solutions/tips/suggestions?
I managed to hack something together (https://repl.it/#crunk1/Cerberus-root-types) subclassing Validator and overriding validate():
class V(cerberus.Validator):
    def validate(self, document, schema=None, update=False, normalize=True):
        doc = None
        wrapped = False
        if schema is not None:
            root_schema = schema.get('__root__', None)
            wrapped = root_schema is not None
            if wrapped:
                doc = {'__root__': document}
                schema = {'__root__': root_schema}
        elif self.schema is not None:
            root_schema = self.schema.get('__root__', None)
            wrapped = root_schema is not None
            if wrapped:
                doc = {'__root__': document}
                schema = {'__root__': root_schema}
        doc = doc or document
        result = super(V, self).validate(doc, schema, update, normalize)
        if wrapped:
            # Unwrap.
            self.document = self.document['__root__']
            for e in self._errors:
                e.schema_path = tuple(e.schema_path[1:])
                if len(e.document_path) > 1:
                    e.document_path = tuple(e.document_path[1:])
        return result
This allows you to treat the root document as a 'type': 'dict' or 'type': 'list'.
v = V()

d = {'1': '1', '2': '2'}
schema = {'__root__': {
    'type': 'dict',
    'keyschema': {'coerce': int},
    'valueschema': {'coerce': int},
}}
print(v.validate(d, schema), v.document, v.errors)

l = ['1', '2']
schema = {'__root__': {
    'type': 'list',
    'schema': {'coerce': int},
}}
print(v.validate(l, schema), v.document, v.errors)

l = ['1', 'b']
print(v.validate(l, schema), v.document, v.errors)
Output:
True {1: 1, 2: 2} {}
True [1, 2] {}
False [1, 'b'] {1: ["field '1' cannot be coerced: invalid literal for int() with base 10: 'b'"]}

Scrapy with a nested array

I'm new to Scrapy and would like to understand how to scrape an object for output into nested JSON. Right now, I'm producing JSON that looks like
[
    {'a': 1,
     'b': '2',
     'c': 3},
]
And I'd like it more like this:
[
    {'a': '1',
     '_junk': {
         'b': 2,
         'c': 3}},
]
---where I put some stuff in _junk subfields to post-process later.
The current code under the parser definition in my scrapername.py is...
item['a'] = x
item['b'] = y
item['c'] = z
And it seemed like
item['a'] = x
item['_junk']['b'] = y
item['_junk']['c'] = z
---might fix that, but I'm getting an error about the _junk key:
File "/usr/local/lib/python2.7/dist-packages/scrapy/item.py", line 49, in __getitem__
return self._values[key]
exceptions.KeyError: '_junk'
Does this mean I need to change my items.py somehow? Currently I have:
class Website(Item):
    a = Field()
    _junk = Field()
    b = Field()
    c = Field()
You need to create the junk dictionary before storing items in it.
item['a'] = x
item['_junk'] = {}
item['_junk']['b'] = y
item['_junk']['c'] = z
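For context, a minimal sketch of how this fits inside the spider's parse method (x, y, and z stand for the scraped values; the method body is illustrative, not from the original answer):
def parse(self, response):
    item = Website()
    item['a'] = x
    item['_junk'] = {}  # create the nested dict before assigning into it
    item['_junk']['b'] = y
    item['_junk']['c'] = z
    # or build it in one step: item['_junk'] = {'b': y, 'c': z}
    yield item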
