I am currently creating my data frame with the below commands
getting data from url
...
devices = get_device_data.json()
device_data = devices["data"]
p_dev = pd.DataFrame(device_data)
However, the JSON dictionary has 60 columns, of which I only want two. Is there a way to specify which columns to include when creating the data frame, or any other way to achieve the desired result?
Thanks
EDIT: some sample data with some columns deleted. I actually only want the id and hostname columns
[
{
"id": 474378238,
"account": "https: //www.****.com/api/v2/accounts/38021/",
"bsid": None,
"carrier": "BOB",
"carrier_id": "BOB BOB",
"channel": None,
"connection_state": "connected",
"gsn": "356853050758871",
"homecarrid": "BOB",
"hostname": "BOB1345345",
"is_asset": True,
"is_gps_supported": True,
"is_upgrade_available": False,
"is_upgrade_supported": True,
"ltebandwidth": "20 MHz",
"mac": None,
"serial": "356853050758871",
"service_type": "LTE",
"ssid": None,
"summary": "connected",
"txchannel": "19667",
"type": "mdm",
"uid": "f5a8da8f",
"updated_at": "2018-08-17T11:19:57.019938+00:00",
"uptime": 86412.8558200002,
},
{
"id": 5674657356,
"account": "https: //www.****.com/api/v2/accounts/38021/",
"bsid": None,
"carrier": "BOB",
"carrier_id": "BOB BOB",
"channel": None,
"connection_state": "connected",
"gsn": "356853050758871",
"homecarrid": "BOB",
"hostname": "BOB10765",
"is_asset": True,
"is_gps_supported": True,
"is_upgrade_available": False,
"is_upgrade_supported": True,
"ltebandwidth": "20 MHz",
"mac": None,
"serial": "356853050758871",
"service_type": "LTE",
"ssid": None,
"summary": "connected",
"txchannel": "19667",
"type": "mdm",
"uid": "f5a8da8f",
"updated_at": "2018-08-17T11:19:57.019938+00:00",
"uptime": 86412.8558200002,
},
{
"id": 5674657465,
"account": "https: //www.****.com/api/v2/accounts/38021/",
"bsid": None,
"carrier": "BOB",
"carrier_id": "BOB BOB",
"channel": None,
"connection_state": "connected",
"gsn": "356853050758871",
"homecarrid": "BOB",
"hostname": "BOB10453453",
"is_asset": True,
"is_gps_supported": True,
"is_upgrade_available": False,
"is_upgrade_supported": True,
"ltebandwidth": "20 MHz",
"mac": None,
"serial": "356853050758871",
"service_type": "LTE",
"ssid": None,
"summary": "connected",
"txchannel": "19667",
"type": "mdm",
"uid": "f5a8da8f",
"updated_at": "2018-08-17T11:19:57.019938+00:00",
"uptime": 86412.8558200002,
},
{
"id": 9756756756,
"account": "https: //www.****.com/api/v2/accounts/38021/",
"bsid": None,
"carrier": "BOB",
"carrier_id": "BOB BOB",
"channel": None,
"connection_state": "connected",
"gsn": "356853050758871",
"homecarrid": "BOB",
"hostname": "BOB100133",
"is_asset": True,
"is_gps_supported": True,
"is_upgrade_available": False,
"is_upgrade_supported": True,
"ltebandwidth": "20 MHz",
"mac": None,
"serial": "356853050758871",
"service_type": "LTE",
"ssid": None,
"summary": "connected",
"txchannel": "19667",
"type": "mdm",
"uid": "f5a8da8f",
"updated_at": "2018-08-17T11:19:57.019938+00:00",
"uptime": 86412.8558200002,
},
]
Use a list comprehension with a nested dict comprehension to filter by column names:
L is list of input data
# Keep only the 'type' and 'id' entries of every record in L.
device_data = [
    {name: record[name] for name in record if name in ('type', 'id')}
    for record in L
]
print (device_data)
[{'id': 474378238, 'type': 'mdm'}, {'id': 5674657356, 'type': 'mdm'},
{'id': 5674657465, 'type': 'mdm'}, {'id': 9756756756, 'type': 'mdm'}]
df = pd.DataFrame(device_data)
print (df)
id type
0 474378238 mdm
1 5674657356 mdm
2 5674657465 mdm
3 9756756756 mdm
You can do it by deleting those columns from the data frame by the column header.
Example: p_dev.drop(['Column_1','Column_2','column_2'], axis = 1, inplace = True)
Another way to do it is to list only the column headers that are needed and then overwrite the existing dataframe with that selection.
Example:
# Columns to keep; every other column is discarded by the selection below.
col_list = ['Column_1', 'Column_3']
# Rebind the existing frame (p_dev) to a view containing only those columns.
p_dev = p_dev[col_list]
Related
I have a JSON response that I want to convert to a Pandas dataframe for further analysis.
{
"isCurrentUserFollowing": False,
"id": 1,
"contactType": "Household",
"isPrivate": False,
"name": "Mrs Mattie Mahoney",
"informalName": "Mattie",
"description": None,
"website": None,
"maritalStatus": None,
"anniversaryMonth": None,
"anniversaryDay": None,
"anniversaryYear": None,
"mergedIntoContactId": None,
"address": {
"id": 1,
"label": "Primary",
"address1": "xyz",
"address2": None,
"city": "abc",
"state": "aa",
"postal": "11111",
"country": "United States",
"isPrimary": True,
"canBePrimary": False,
"startMonth": 1,
"startDay": 1,
"endMonth": 12,
"endDay": 31
},
"giftAskAmount": "$250",
"giftAskType": "Recurring Gift",
"lifeToDateGiving": "$225",
"yearToDateGiving": "$225",
"lastGiftAmount": "$225",
"lastGiftDate": "7/2/2021",
"contactIndividuals": [
{
"id": 1,
"contactId": 1,
"prefix": "Mrs",
"firstName": "Joan",
"middleName": None,
"lastName": "Arc",
"suffix": None,
"gender": "Female",
"isPrimary": True,
"canBePrimary": False,
"isSecondary": False,
"canBeSecondary": False,
"birthMonth": 1,
"birthDay": 1,
"birthYear": 1900,
"birthDate": "April 1",
"approximateAge": 100,
"isDeceased": False,
"passion": None,
"avatarUrl": "www.abc.com",
"contactMethods": [
{
"id": 1,
"type": "Home Email",
"value": "abc#hotmail.com",
"isOptedIn": False,
"isPrimary": True,
"canBePrimary": False
},
{
"id": 2,
"type": "Mobile Phone",
"value": "(111) 1111111",
"isOptedIn": False,
"isPrimary": True,
"canBePrimary": False
}
],
"createDateTimeUtc": "2021-04-15T17:41:31.2166667",
"modifiedDateTimeUtc": "2021-04-15T17:41:31.2166667",
"customFields": [],
"customCollections": []
},
{
"id": 2,
"contactId": 1,
"prefix": None,
"firstName": "John",
"middleName": None,
"lastName": "Smith",
"suffix": None,
"gender": "Female",
"isPrimary": False,
"canBePrimary": True,
"isSecondary": True,
"canBeSecondary": False,
"birthMonth": 1,
"birthDay": 11,
"birthYear": 1990,
"birthDate": "July 1",
"approximateAge": 100,
"isDeceased": False,
"passion": None,
"avatarUrl": "www.google.com",
"contactMethods": [
{
"id": 3,
"type": "Home Email",
"value": "mno#gmail.com",
"isOptedIn": False,
"isPrimary": True,
"canBePrimary": False
}
],
"createDateTimeUtc": "2021-04-15T17:41:31.2166667",
"modifiedDateTimeUtc": "2021-04-15T17:41:31.2166667",
"customFields": [],
"customCollections": []
}
],
"contactGiftsUrl": "/api/Gift/ByContact/1",
"contactPassthroughGiftsUrl": "/api/Gift/Passthrough/ByContact/1",
"contactPlannedGiftsUrl": "/api/PlannedGift/ByContact/1",
"contactRecurringGiftsUrl": "/api/RecurringGift/ByContact/1",
"contactImportantNotesUrl": "/api/ContactNote/Important/ByContact/1",
"contactNotesUrl": "/api/ContactNote/ByContact/1",
"contactTagsUrl": "/api/ContactTag/ByContact/1",
"contactRelationshipsUrl": "/api/Relationship/ByContact/1",
"primaryAvatarUrl": "www.apple.com",
"contactReferences": [],
"originSegmentId": None,
"originSegment": None,
"createDateTimeUtc": "2021-04-15T17:41:31.217",
"modifiedDateTimeUtc": "2021-04-15T17:41:31.217",
"customFields": [],
"customCollections": []
}
I am struggling with the code I need to write to get the relevant fields out.
I am able to get some fields, but I cannot retrieve the gender and emails. Can someone assist?
# Start from an empty frame that fixes the output columns and their order.
df = pd.DataFrame(
    columns=[
        "contactcustomerid",
        "contactType",
        "contactName",
        "contactaddress",
        "contactcity",
        "contactstate",
        "contactzip",
        "contactcountry",
        "giftAskAmount",
        "giftAskType",
        "lifetodateGiving",
        "yeartoDateGiving",
        "lastGiftAmount",
        "lastGiftDate",
        "contactGender",
        "contactEmail",
    ]
)
if json_data:
    # "address" may be missing or null in the payload; fall back to an empty
    # mapping so the lookups below yield None instead of raising.
    address = json_data.get("address") or {}
    row = {
        "contactcustomerid": json_data.get("id"),
        "contactType": json_data.get("contactType"),
        "contactName": json_data.get("name"),
        "contactaddress": address.get("address1"),
        "contactcity": address.get("city"),
        "contactstate": address.get("state"),
        "contactzip": address.get("postal"),
        "contactcountry": address.get("country"),
        "giftAskAmount": json_data.get("giftAskAmount"),
        "giftAskType": json_data.get("giftAskType"),
        "lifetodateGiving": json_data.get("lifeToDateGiving"),
        "yeartoDateGiving": json_data.get("yearToDateGiving"),
        "lastGiftAmount": json_data.get("lastGiftAmount"),
        "lastGiftDate": json_data.get("lastGiftDate"),
    }
    # NOTE: "contactGender" and "contactEmail" are declared above but are not
    # populated here, so they end up as 0 after fillna(). In the sample
    # payload those values sit under contactIndividuals[*]["gender"] and
    # contactIndividuals[*]["contactMethods"][*]["value"].

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add rows. ignore_index takes a real boolean, not the string "True".
    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

# Columns that received no value are filled with 0 (a no-op on an empty frame).
df = df.fillna(0)
I'm trying to webscrape the data table from this site:
https://fl511.com/list/events/traffic?start=0&length=25&filters%5B0%5D%5Bi%5D=5&filters%5B0%5D%5Bs%5D=Incidents&order%5Bi%5D=8&order%5Bdir%5D=asc
But unfortunately, when I print out the table it doesn't return the tbody tag (which the information is stored in). All the other tags are shown. Is there a workaround to this?
# Wrap the page address in a Request that carries a browser-like User-Agent.
# NOTE(review): `url` must already hold the page address before this line;
# the name is rebound to the Request object here.
url = Request(
url,
headers={'User-Agent': 'Mozilla/5.0'}
)
# Fetch the raw response body.
webpage = urlopen(url).read()
# NOTE(review): `soup` is not defined anywhere in this snippet - presumably a
# parsing step that builds it from `webpage` was omitted; confirm.
table = soup.find_all('table')
print(table)
The data is loaded from an external source via JavaScript. You can use this example to load the data:
import json
import requests
data = {
"draw": 1,
"columns": [
{
"data": None,
"name": "",
"searchable": False,
"orderable": False,
"search": {"value": "", "regex": False},
"title": "",
"visible": True,
"isUtcDate": False,
"isCollection": False,
},
{
"data": "region",
"name": "region",
"searchable": False,
"orderable": True,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": "county",
"name": "county",
"searchable": False,
"orderable": True,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": "roadwayName",
"name": "roadwayName",
"searchable": False,
"orderable": True,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": "direction",
"name": "direction",
"searchable": False,
"orderable": True,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": "type",
"name": "type",
"searchable": False,
"orderable": True,
"search": {"value": "Incidents", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": "severity",
"name": "severity",
"searchable": False,
"orderable": True,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": "description",
"name": "description",
"searchable": False,
"orderable": False,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": "startTime",
"name": "startTime",
"searchable": False,
"orderable": True,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": "lastUpdated",
"name": "lastUpdated",
"searchable": False,
"orderable": True,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
{
"data": 10,
"name": "",
"searchable": False,
"orderable": False,
"search": {"value": "", "regex": False},
"isUtcDate": False,
"isCollection": False,
},
],
"order": [{"column": 8, "dir": "asc"}],
"start": 0,
"length": 25,
"search": {"value": "", "regex": False},
}
url = "https://fl511.com/List/GetData/traffic"

# Bound the wait and surface HTTP errors explicitly, rather than hanging on a
# stalled connection or trying to decode an error page as JSON.
response = requests.post(url, json=data, timeout=30)
response.raise_for_status()
# From here on `data` holds the decoded response, not the request payload.
data = response.json()

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

# Number the returned records starting at 1 and show each description.
for i, d in enumerate(data["data"], 1):
    print(i, d["description"])

print()
print("Records total:", data["recordsTotal"])
print("Records filtered:", data["recordsFiltered"])
Prints:
1 Crash in Highlands County on US-27 South, at Lake Josephine Dr. Right lane blocked. Last updated at 04:24 PM.
2 Emergency vehicles in Highlands County on US-27 North, at Lake Josephine Dr. Right lane blocked. Last updated at 04:25 PM.
3 Crash in Manatee County on US-41 North, at Pearl Ave. All lanes blocked. Last updated at 04:29 PM.
4 Crash in Polk County on I-4 East, beyond CR-557. 2 Left lanes blocked. Last updated at 04:32 PM.
5 Emergency vehicles in Manatee County on US-41 South, at Pearl Ave. Left lane blocked. Last updated at 04:35 PM.
6 Crash in Miami-Dade County on I-195 East, beyond North Miami Ave. Right lane blocked. Last updated at 05:03 PM.
7 Crash in Santa Rosa County on I-10 East, ramp to Exit 22 (SR-281/Avalon Blvd). Right shoulder blocked. Last updated at 05:05 PM.
8 Emergency vehicles in Santa Rosa County on I-10 West, at Exit 22 (SR-281/Avalon Blvd). Left shoulder blocked. Last updated at 05:02 PM.
9 Multi-vehicle crash in Duval County on I-295 E South, before Between Atlantic Blvd/St Johns Bluff Rd. Left shoulder blocked. Last updated at 05:30 PM.
Records total: 93
Records filtered: 9
I am unable to create a dictionary whose structure is like :
"aggs": {
"4": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"time_zone": "Asia/Kolkata",
"min_doc_count": 1,
},
"aggs": {
"5": {
"range": {
"field": "system.core.idle.pct",
"ranges": [{"from": 0, "to": 0.001}, {"from": 0.001, "to": 1}],
"keyed": true,
},
"aggs": {
"2": {"max": {"field": "system.core.system.pct"}},
"3": {"max": {"field": "system.core.idle.pct"}},
},
}
},
}
}
I mostly run into trouble when creating aggs:5 inside aggs:4 and then looping over it. I need to do this for multiple aggs; the number of agg ids can be up to 1000.
I am trying to derive it from the dictionary:
"aggs": [
{"id": "1", "enabled": true, "type": "count", "schema": "metric", "params": {}},
{
"id": "2",
"enabled": true,
"type": "max",
"schema": "metric",
"params": {"field": "system.core.system.pct", "customLabel": "system"},
},
{
"id": "3",
"enabled": true,
"type": "max",
"schema": "metric",
"params": {"field": "system.core.idle.pct", "customLabel": "idle"},
},
{
"id": "4",
"enabled": true,
"type": "date_histogram",
"schema": "bucket",
"params": {
"field": "#timestamp",
"interval": "m",
"customInterval": "2h",
"min_doc_count": 1,
"extended_bounds": {},
},
},
{
"id": "5",
"enabled": true,
"type": "range",
"schema": "bucket",
"params": {
"field": "system.core.idle.pct",
"ranges": [{"from": 0, "to": 0.001}, {"from": 0.001, "to": 1}],
},
},
]
Can anyone show the code to do this? Basically, I am failing to create the nested dictionary.
It looks like you are using JSON data.
See this link: https://realpython.com/python-json/.
You can use the built-in json library to convert the whole thing into a dict.
Basically something like this:
# The snippet relies on the standard-library json module, so import it here.
import json

# Raw JSON text, exactly as it would arrive from a file or an HTTP response.
json_string = """
{
"researcher": {
"name": "Ford Prefect",
"species": "Betelgeusian",
"relatives": [
{
"name": "Zaphod Beeblebrox",
"species": "Betelgeusian"
}
]
}
}
"""
# json.loads parses the text into nested Python dicts and lists.
data = json.loads(json_string)
Wrap the whole structure in {} and replace true with True.
{"aggs": {
"4": {
"date_histogram": {
"field": "#timestamp",
"interval": "1m",
"time_zone": "Asia/Kolkata",
"min_doc_count": 1,
},
"aggs": {
"5": {
"range": {
"field": "system.core.idle.pct",
"ranges": [{"from": 0, "to": 0.001}, {"from": 0.001, "to": 1}],
"keyed": True,
},
"aggs": {
"2": {"max": {"field": "system.core.system.pct"}},
"3": {"max": {"field": "system.core.idle.pct"}},
},
}
},
}
}}
{'aggs': {'4': {'date_histogram': {'field': '#timestamp',
'interval': '1m',
'time_zone': 'Asia/Kolkata',
'min_doc_count': 1},
'aggs': {'5': {'range': {'field': 'system.core.idle.pct',
'ranges': [{'from': 0, 'to': 0.001}, {'from': 0.001, 'to': 1}],
'keyed': True},
'aggs': {'2': {'max': {'field': 'system.core.system.pct'}},
'3': {'max': {'field': 'system.core.idle.pct'}}}}}}}}
I am getting data from the DB, but I want to return only a few specific columns from my GET API call.
I am getting multiple dictionaries.
driver_data = MasterDriver.query.filter_by(Driver_Status_id=2, is_driver_verified=1)
return drivers_schema.jsonify(driver_data)
[
{
"email_id": "abjhgfjhgc#xyz.com",
"first_name": "manoj",
"isActive": null,
"last_name": "sayi",
"license": null,
"mobile_number": "+917780187505",
},
{
"email_id": "abjhgfjhgc#xyz.com",
"first_name": "manoj",
"isActive": null,
"last_name": "sayi",
"license": null,
"mobile_number": "+917780187505",
},
{
"email_id": "abjhgfjhgc#xyz.com",
"first_name": "manoj",
"isActive": null,
"last_name": "sayi",
"license": null,
"mobile_number": "+917780187505",
}
]
null does not exist in Python; you are probably looking for None.
What you are trying to do is a very basic `for x in array` operation.
I suggest beginning with the basics of Python to be able to solve your problem.
Nevertheless, here's a working solution to retrieve the elements in your array:
arr = [
{
"email_id": "abjhgfjhgc#xyz.com",
"first_name": "manoj",
"isActive": None,
"last_name": "sayi",
"license": None,
"mobile_number": "+917780187505",
},
{
"email_id": "abjhgfjhgc#xyz.com",
"first_name": "manoj",
"isActive": None,
"last_name": "sayi",
"license": None,
"mobile_number": "+917780187505",
},
{
"email_id": "abjhgfjhgc#xyz.com",
"first_name": "manoj",
"isActive": None,
"last_name": "sayi",
"license": None,
"mobile_number": "+917780187505",
}
]
# Walk the records and print every field on its own line, in a fixed order.
for record in arr:
    for field in (
        "email_id",
        "first_name",
        "isActive",
        "last_name",
        "license",
        "mobile_number",
    ):
        print(record[field])
You can have a look at repl.it to execute it.
I have the following json document:
{
"id": "5c26321bd8f4113d43b91141",
"idMemberCreator": "5b203bc7e47d817a8138bc37",
"data": {
"list": {
"name": "Sorji for QA",
"id": "5b0a2543b89acdbdb85f7b42"
},
"board": {
"shortLink": "iyCzZ5jx",
"name": "FlicksIO",
"id": "5b0a251f68a9e74b8ec3b3ac"
},
"card": {
"shortLink": "vOt2vO7v",
"idShort": 92,
"name": "New column in main for Storefront provider correlation.",
"id": "5b9c0023533f7c26424ea4ed",
"closed": true
},
"old": {
"closed": false
}
},
"type": "updateCard",
"date": "2018-12-28T14:24:27.455Z",
"limits": {},
"memberCreator": {
"id": "5b203bc7e47d817a8138bc37",
"avatarHash": "73bfa48c76c3c92615fe89ff79a6c5ae",
"avatarUrl": "https://trello-avatars.s3.amazonaws.com/73bfa48f79a6c5ae",
"fullName": "Marie Bond",
"idMemberReferrer": null,
"initials": "MB",
"username": "mb"
}
}
I would like to expand this out to be a single level with dot notation. That is, it should look like:
{
"id": "5c26321bd8f4113d43b91141",
"idMemberCreator": "5b203bc7e47d817a8138bc37",
"data.list.name": "Sorji for QA",
"data.list.id": "5b0a2543b89acdbdb85f7b42",
"data.board.shortLink": "iyCzZ5jx",
"data.board.name": "FlicksIO",
"data.board.id": "5b0a251f68a9e74b8ec3b3ac",
"data.card.shortLink": "vOt2vO7v",
"data.card.idShort": 92,
"data.card.name": "New column in main for Storefront provider correlation.",
"data.card.id": "5b9c0023533f7c26424ea4ed",
"data.card.closed": true,
"data.old.closed": false,
"type": "updateCard",
"date": "2018-12-28T14:24:27.455Z",
"limits": {},
"memberCreator.id": "5b203bc7e47d817a8138bc37",
"memberCreator.avatarHash": "73bfa48c76c3c92615fe89ff79a6c5ae",
"memberCreator.avatarUrl": "https://trello-avatars.s3.amazonaws.com/73bfa48f79a6c5ae",
"memberCreator.fullName": "Marie Bond",
"memberCreator.idMemberReferrer": null,
"memberCreator.initials": "MB",
"memberCreator.username": "mb"
}
Would it be possible to do this with a generator object? I've been working a lot on recursion today, and have been trying to move from while loops to using generator objects and yields, etc.
You can keep a parameter in the signature of the recursive function to store the paths:
data = {'id': '5c26321bd8f4113d43b91141', 'idMemberCreator': '5b203bc7e47d817a8138bc37', 'data': {'list': {'name': 'Sorji for QA', 'id': '5b0a2543b89acdbdb85f7b42'}, 'board': {'shortLink': 'iyCzZ5jx', 'name': 'FlicksIO', 'id': '5b0a251f68a9e74b8ec3b3ac'}, 'card': {'shortLink': 'vOt2vO7v', 'idShort': 92, 'name': 'New column in main for Storefront provider correlation.', 'id': '5b9c0023533f7c26424ea4ed', 'closed': True}, 'old': {'closed': False}}, 'type': 'updateCard', 'date': '2018-12-28T14:24:27.455Z', 'limits': {}, 'memberCreator': {'id': '5b203bc7e47d817a8138bc37', 'avatarHash': '73bfa48c76c3c92615fe89ff79a6c5ae', 'avatarUrl': 'https://trello-avatars.s3.amazonaws.com/73bfa48f79a6c5ae', 'fullName': 'Marie Bond', 'idMemberReferrer': None, 'initials': 'MB', 'username': 'mb'}}
def dot_paths(d, _paths=None):
    """Yield ``[dotted_key, value]`` pairs that flatten the nested dict *d*.

    Each non-empty dict value is descended into and its keys are joined to
    the parent key with ``.``. Every other value, including an empty dict,
    is emitted as a leaf. Passing the generator to ``dict()`` gives the
    flattened mapping.

    Args:
        d: The (possibly nested) dict to flatten.
        _paths: List of ancestor keys leading to *d*. Used internally by the
            recursion; callers normally leave it unset.

    Yields:
        Two-item lists ``[dotted_key, value]`` in the iteration order of *d*.
    """
    # A None sentinel replaces the shared mutable default list.
    prefix = [] if _paths is None else _paths
    for key, value in d.items():
        if value and isinstance(value, dict):
            # Non-empty dict: recurse with the current key added to the path.
            yield from dot_paths(value, prefix + [key])
        else:
            # Leaf (scalars, None, lists, empty dicts): emit under its full path.
            yield ['.'.join(prefix + [key]), value]
import json
print(json.dumps(dict(dot_paths(data)), indent=4))
Output:
{
"id": "5c26321bd8f4113d43b91141",
"idMemberCreator": "5b203bc7e47d817a8138bc37",
"data.list.name": "Sorji for QA",
"data.list.id": "5b0a2543b89acdbdb85f7b42",
"data.board.shortLink": "iyCzZ5jx",
"data.board.name": "FlicksIO",
"data.board.id": "5b0a251f68a9e74b8ec3b3ac",
"data.card.shortLink": "vOt2vO7v",
"data.card.idShort": 92,
"data.card.name": "New column in main for Storefront provider correlation.",
"data.card.id": "5b9c0023533f7c26424ea4ed",
"data.card.closed": true,
"data.old.closed": false,
"type": "updateCard",
"date": "2018-12-28T14:24:27.455Z",
"limits": {},
"memberCreator.id": "5b203bc7e47d817a8138bc37",
"memberCreator.avatarHash": "73bfa48c76c3c92615fe89ff79a6c5ae",
"memberCreator.avatarUrl": "https://trello-avatars.s3.amazonaws.com/73bfa48f79a6c5ae",
"memberCreator.fullName": "Marie Bond",
"memberCreator.idMemberReferrer": null,
"memberCreator.initials": "MB",
"memberCreator.username": "mb"
}