I need some assistance with a somewhat simple issue.
I'm trying to convert the content of a json file from this:
{ "Timestamp": "Timestamp",
"name": "SVCHOST.EXE",
"icon": "binary_icon.png",
"Process": SVCHOST.EXE,
"Pid": "876",
"PPID": "500"],
"children": [Process details])
},
{ "Timestamp":"Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "500",
"PPID": "4"],
"children": [Process details])
},
{ "Timestamp":"Timestamp",
"name": "SYSTEM",
"icon": "binary_icon.png",
"Process": "SYSTEM",
"Pid": "4",
"PPID": "0"],
"children": [Process details])
}
To this:
{
    "name": "Root",
    "children": [
        {
            "name": "4",
            "children": [
                {
                    "name": "500",
                    "children": [
                        {
                            "name": "876",
                            "children": []
                        }
                    ]
                }
            ]
        }
    ]
}
The goal is to create a node tree graph in the end.
After a lot of trial and error, I'm still not close to the output I need, so I'm asking for some pointers, tips, or tricks.
Any help is much appreciated.
Thanks,
Here is my most recent attempt.
import json
links = ({
"Timestamp": "Timestamp",
"name": "SVCHOST.EXE",
"icon": "binary_icon.png",
"Process": "SVCHOST.EXE",
"Pid": "876",
"PPID": "500",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "500",
"PPID": "4",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "SYSTEM",
"icon": "binary_icon.png",
"Process": "SYSTEM",
"Pid": "4",
"PPID": "0",
"children": "Process_details"
})
parent_proc_node = {}
root = {'name': 'Root', 'children': []}
for item in links:
    parent_node = parent_proc_node.get(item['Pid'])
    if not parent_node:
        parent_proc_node[item['Pid']] = parent_node = {'name': item['PPID']}
        root['children'].append(parent_node)
    parent_proc_node[item['PPID']] = child_node = {'name': item['Pid']}
    parent_node.setdefault('children', []).append(child_node)
print(json.dumps(root, indent=4))
Current output:
{
    "name": "Root",
    "children": [
        {
            "name": "500",
            "children": [
                {
                    "name": "876",
                    "children": [
                        {
                            "name": "500",
                            "children": [
                                {
                                    "name": "4"
                                }
                            ]
                        }
                    ]
                }
            ]
        }
    ]
}
The output format is now what I want, but I'm still not able to correctly match parent processes with their children.
What am I doing wrong?
The correct output would be like this:
{
    "name": "Root",
    "children": [
        {
            "name": "4",
            "children": [
                {
                    "name": "500",
                    "children": [
                        {
                            "name": "876",
                            "children": [
                                {
                                    "name": ""
                                }
                            ]
                        }
                    ]
                }
            ]
        }
    ]
}
Here's some code that does what I think you want. It processes the links (which I turned into a list, since JSON doesn't have tuples), converting it into the nested structure that you show as the final correct output. I've also added a couple of new records so that some parents have multiple children.
The trick is to first create a dictionary (ids) that captures the parent-child relationship of the process IDs.
import json
links = [
{
"Timestamp": "Timestamp",
"name": "SVCHOST.EXE",
"icon": "binary_icon.png",
"Process": "SVCHOST.EXE",
"Pid": "876",
"PPID": "500",
"children": "Process_details"
},
{
"Timestamp": "Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "500",
"PPID": "4",
"children": "Process_details"
},
{
"Timestamp": "Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "510",
"PPID": "4",
"children": "Process_details"
},
{
"Timestamp": "Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "600",
"PPID": "510",
"children": "Process_details"
},
{
"Timestamp": "Timestamp",
"name": "SYSTEM",
"icon": "binary_icon.png",
"Process": "SYSTEM",
"Pid": "4",
"PPID": "0",
"children": "Process_details"
}
]
# Create a dict linking each pid to its parent
ids = {}
for d in links:
    # Use "0" as the ppid if the "PPID" field is an empty string
    ppid, pid = d["PPID"] or "0", d["Pid"]
    ids.setdefault(ppid, []).append(pid)

print(ids)

# Nest the data for each pid in its parent's dict
def insert(lst, ppid, name):
    if ppid in ids:
        children = []
        lst.append({"name": name, "children": children})
        for pid in ids[ppid]:
            insert(children, pid, pid)
    else:
        children = [{"name": ""}]
        lst.append({"name": name, "children": children})

nested = []
insert(nested, "0", "Root")
print(json.dumps(nested[0], indent=4))
Output:
{'500': ['876'], '4': ['500', '510'], '510': ['600'], '0': ['4']}
{
    "name": "Root",
    "children": [
        {
            "name": "4",
            "children": [
                {
                    "name": "500",
                    "children": [
                        {
                            "name": "876",
                            "children": [
                                {
                                    "name": ""
                                }
                            ]
                        }
                    ]
                },
                {
                    "name": "510",
                    "children": [
                        {
                            "name": "600",
                            "children": [
                                {
                                    "name": ""
                                }
                            ]
                        }
                    ]
                }
            ]
        }
    ]
}
@PM 2Ring Sorry, please disregard the PPID 0 remarks; that was missing execution handling at my end. :)
Your example works perfectly for parents with child PIDs. However, if a PID has no parent, it is not added to the root node.
procs = [{
"Timestamp": "Timestamp",
"name": "SVCHOST.EXE",
"icon": "binary_icon.png",
"Process": "SVCHOST.EXE",
"Pid": "876",
"PPID": "500",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "500",
"PPID": "4",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "SYSTEM",
"icon": "binary_icon.png",
"Process": "SYSTEM",
"Pid": "4",
"PPID": "0",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "ROUGEPROC",
"icon": "binary_icon.png",
"Process": "ROUGEPROC",
"Pid": "4322",
"PPID": "",
"children": "Process_details"
}]
# Create a dict linking each pid to its parent
ids = {}
for d in procs:
    ppid, pid = d["PPID"], d["Pid"]
    ids.setdefault(ppid, []).append(pid)

print(ids)

# Nest the data for each pid in its parent's dict
def insert(lst, ppid, name):
    if ppid in ids:
        children = []
        lst.append({"name": name, "children": children, "icon": "binary_icon.png"})
        for pid in ids[ppid]:
            insert(children, pid, pid)
    else:
        children = []
        lst.append({"name": name, "children": children, "icon": "binary_icon.png"})

nested = []
insert(nested, "0", "PPID")
proc_report = {"name": "HOSTNAME",
               "icon": "win_os_icon.png",
               "children": nested[0]["children"]}
print(json.dumps(proc_report, indent=4))
Output:
{'': ['4322'], '0': ['4'], '4': ['500'], '500': ['876']}
{
    "children": [
        {
            "icon": "binary_icon.png",
            "name": "4",
            "children": [
                {
                    "icon": "binary_icon.png",
                    "name": "500",
                    "children": [
                        {
                            "icon": "binary_icon.png",
                            "name": "876",
                            "children": []
                        }
                    ]
                }
            ]
        }
    ],
    "name": "HOSTNAME",
    "icon": "win_os_icon.png"
}
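One way to pick up such parentless PIDs is to fold any PPID that doesn't match a known Pid into the synthetic "0" root when building ids. A minimal sketch of that tweak (assuming rogue processes should simply hang off the root):

pids = {d["Pid"] for d in procs}

ids = {}
for d in procs:
    # Attach records whose PPID is empty or unknown directly under the "0" root
    ppid = d["PPID"] if d["PPID"] in pids else "0"
    ids.setdefault(ppid, []).append(d["Pid"])

With the procs list above this gives {'500': ['876'], '4': ['500'], '0': ['4', '4322']}, so ROUGEPROC ends up as a direct child of the root node.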
I'm wondering if you could help me with filling JSON files with their original filenames.
Here is a sample:
jsv is a list of JSON objects (the first main key is the document number: document_0, document_1, ...):
jsv =
[
    {
        "document_0": {
            "id": 111,
            "laboratory": "xxx",
            "document_type": "xxx",
            "language": "pl",
            "creation_date": "09-12-2022",
            "source_filename": "None",
            "version": "0.1",
            "exams_ocr_avg_confidence": 0.0,
            "patient_data": {
                "first_name": "YYYY",
                "surname": "YYYY",
                "pesel": "12345678901",
                "birth_date": "1111-22-22",
                "sex": "F",
                "age": "None"
            },
            "exams": [
                {
                    "name": "xx",
                    "sampling_date": "2020-11-30",
                    "comment": "None",
                    "confidence": 97,
                    "result": "222",
                    "unit": "ml",
                    "norm": "None",
                    "material": "None",
                    "icd9": "uuuuu"
                }
            ]
        }
    },
    {
        "document_1": {
            "id": 111,
            "laboratory": "xxx",
            "document_type": "xxx",
            "language": "pl",
            "creation_date": "09-12-2022",
            "source_filename": "None",
            "version": "0.1",
            "exams_ocr_avg_confidence": 0.0,
            "patient_data": {
                "first_name": "YYYY",
                "surname": "YYYY",
                "pesel": "12345678901",
                "birth_date": "1111-22-22",
                "sex": "F",
                "age": "None"
            },
            "exams": [
                {
                    "name": "xx",
                    "sampling_date": "2020-11-30",
                    "comment": "None",
                    "confidence": 97,
                    "result": "222",
                    "unit": "ml",
                    "norm": "None",
                    "material": "None",
                    "icd9": "uuuuu"
                }
            ]
        }
    }
]
Inside this JSON there is a key, source_filename, which I want to update with the real name of the JSON file.
My folder with files, as an example:
'11111.pdf.json',
'11112.pdf.json',
'11113.pdf.json',
'11114.pdf.json',
'11115.pdf.json'
What I want to achieve:
jsv =
[
    {
        "document_0": {
            "id": 111,
            "laboratory": "xxx",
            "document_type": "xxx",
            "language": "pl",
            "creation_date": "09-12-2022",
            "source_filename": "11111.pdf.json",
            "version": "0.1",
            "exams_ocr_avg_confidence": 0.0,
            "patient_data": {
                "first_name": "YYYY",
                "surname": "YYYY",
                "pesel": "12345678901",
                "birth_date": "1111-22-22",
                "sex": "F",
                "age": "None"
            },
            "exams": [
                {
                    "name": "xx",
                    "sampling_date": "2222-22-22",
                    "comment": "None",
                    "confidence": 22,
                    "result": "222",
                    "unit": "ml",
                    "norm": "None",
                    "material": "None",
                    "icd9": "uuuuu"
                }
            ]
        }
    },
    {
        "document_1": {
            "id": 111,
            "laboratory": "xxx",
            "document_type": "xxx",
            "language": "pl",
            "creation_date": "22-22-2222",
            "source_filename": "11111.pdf.json",
            "version": "0.1",
            "exams_ocr_avg_confidence": 0.0,
            "patient_data": {
                "first_name": "YYYY",
                "surname": "YYYY",
                "pesel": "12345678901",
                "birth_date": "1111-22-22",
                "sex": "F",
                "age": "None"
            },
            "exams": [
                {
                    "name": "xx",
                    "sampling_date": "2222-11-22",
                    "comment": "None",
                    "confidence": 22,
                    "result": "222",
                    "unit": "ml",
                    "norm": "None",
                    "material": "None",
                    "icd9": "uuuuu"
                }
            ]
        }
    }
]
document_0 and document_1 get the same filename.
What I've managed to get:
from os import listdir
from os.path import isfile, join

dir_name = 'path_name'
onlyfiles = [f for f in listdir(dir_name) if isfile(join(dir_name, f))]
onlyfiles is a list of the filenames of my JSON files.
Now I was thinking of updating my jsv with it in a loop somehow?
But I'm also looking for a method that is efficient, due to the large amount of data I have to process.
EDIT:
I've managed to do it with a for loop, but maybe there is a more efficient way:
for i in range(len(jsv)):
    if type(jsv[i]) == dict:
        jsv[i]["document_0"].update({"source_filename": onlyfiles[i]})
    else:
        print(onlyfiles[i])
If your jsv is:
jsv = [
{
"document_0": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "None",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
},
],
}
},
{
"document_1": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "None",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
},
],
},
},
]
In Python, you can do something like this:
arq = ['11111.pdf.json', '11112.pdf.json']

if len(arq) == len(jsv):
    for i, doc in enumerate(jsv):
        for key in doc:
            doc[key]['source_filename'] = arq[i]
Note that you need to check that the length of the files list is the same as the length of the jsv list!
The result is this jsv:
[
{
"document_0": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "11111.pdf.json",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
}
],
}
},
{
"document_1": {
"id": 222,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "11112.pdf.json",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
}
],
}
},
]
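If the one-to-one pairing of filenames and documents holds, a zip-based variant avoids the manual indexing; a small sketch under that same assumption:

for fname, doc in zip(onlyfiles, jsv):
    # Each list element holds a single "document_N" key; update its nested dict
    for inner in doc.values():
        inner["source_filename"] = fname

Note that zip() stops at the shorter of the two inputs, so the explicit length check above is still worth keeping.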
I have a JSON object:
{
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
I want to get the result as a new JSON, but without using pandas (and all those explode, flatten and normalize functions...). Is there any option to get this structure without using pandas or running into an out-of-memory issue?
The output should be:
{ "id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{ "id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24",
},
{ "id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{ "id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
},
You can simply loop over the list associated with "geography" and build new dictionaries that you will add to a newly created list:
dict_in = {
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
import json

rec_out = []
for obj in dict_in["data"]["geography"]:
    for prop in obj["properties"]:
        # Copy the parent-level fields, then merge in the property fields
        dict_out = {
            "id": obj["id"],
            "state": obj["state"]
        }
        dict_out.update(prop)
        rec_out.append(dict_out)

print(json.dumps(rec_out, indent=4))
Output:
[
{
"id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{
"id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
}
]
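Since the question mentions memory, the same loop can also be written as a generator, so rows are produced one at a time instead of being collected in a list first. A sketch, assuming you can consume the rows as they stream:

import json

def flatten(records):
    # Lazily yield one flat dict per (geography, property) pair
    for obj in records["data"]["geography"]:
        for prop in obj["properties"]:
            yield {"id": obj["id"], "state": obj["state"], **prop}

for row in flatten(dict_in):
    print(json.dumps(row))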
I have a nested dictionary like below:
[
{
"name": "A",
"flag": "folder",
"children": [
{
"name": "A1",
"flag": "folder",
"children": [
{
"name": "A1x",
"flag": "file",
"children": []
},
{
"name": "A1y",
"flag": "file",
"children": []
}
]
}
]
}
]
From this dict, I would like to generate a dataframe with one row per leaf path and a name/flag column pair for each nesting level.
Is there a nice way to do this?
With the following nested dictionary which expands on yours for demonstration purposes:
data = [
{
"name": "A",
"flag": "folder",
"children": [
{
"name": "A1",
"flag": "folder",
"children": [
{
"name": "A1x",
"flag": "file",
"children": [{"name": "A1xx", "flag": "file", "children": []}],
},
{
"name": "A1y",
"flag": "file",
"children": [{"name": "A1yy", "flag": "file", "children": []}],
},
],
},
{
"name": "A2",
"flag": "folder",
"children": [
{
"name": "A2x",
"flag": "file",
"children": [{"name": "A2xx", "flag": "file", "children": []}],
},
{
"name": "A2y",
"flag": "file",
"children": [{"name": "A2yy", "flag": "file", "children": []}],
},
],
},
],
},
{
"name": "B",
"flag": "folder",
"children": [
{
"name": "B1",
"flag": "folder",
"children": [
{
"name": "B1x",
"flag": "file",
"children": [{"name": "B1xx", "flag": "file", "children": []}],
},
{
"name": "B1y",
"flag": "file",
"children": [{"name": "B1yy", "flag": "file", "children": []}],
},
],
},
{
"name": "B2",
"flag": "folder",
"children": [
{
"name": "B2x",
"flag": "file",
"children": [{"name": "B2xx", "flag": "file", "children": []}],
},
{
"name": "B2y",
"flag": "file",
"children": [{"name": "B2yy", "flag": "file", "children": []}],
},
],
},
],
},
]
Here is one way to do it by defining two short helper functions:
import pandas as pd

def traverse(data, new_data=None):
    """Recursive function to go through dict of values.

    Args:
        data: target dict.
        new_data: container. Defaults to None.

    Returns:
        Flattened data.
    """
    new_data = new_data if new_data else [[]]
    new_data[-1].append(data["name"])
    new_data[-1].append(data["flag"])
    for child in data["children"]:
        traverse(child, new_data)
        new_data.append([])
    return new_data

def make_rows(flat_data):
    """Custom function to shape data.

    Args:
        flat_data: target data.

    Returns:
        Dataframe.
    """
    rows = [[]]
    for item in flat_data:
        if item:
            rows[-1] += item
        else:
            rows.append([None for _ in range(int(len(rows[-1]) / 2))])
    return pd.DataFrame(rows).dropna(how="all").fillna(method="ffill")

And then:

df = pd.concat([make_rows(traverse(item)) for item in data]).reset_index(drop=True)
df.columns = pd.MultiIndex.from_product(
    [[f"Level {i}" for i in range(int(df.shape[1] / 2))], ["name", "flag"]]
)
print(df)
# Output
Level 0 Level 1 Level 2 Level 3
name flag name flag name flag name flag
0 A folder A1 folder A1x file A1xx file
1 A folder A1 folder A1y file A1yy file
2 A folder A2 folder A2x file A2xx file
3 A folder A2 folder A2y file A2yy file
4 B folder B1 folder B1x file B1xx file
5 B folder B1 folder B1y file B1yy file
6 B folder B2 folder B2x file B2xx file
7 B folder B2 folder B2y file B2yy file
I have stored the data in ArangoDB in the following format:
{"data": [
{
"content": "maindb",
"type": "string",
"name": "db_name",
"key": "1745085839"
},
{
"type": "id",
"name": "rel",
"content": "1745085840",
"key": "1745085839"
},
{
"content": "user",
"type": "string",
"name": "rel_name",
"key": "1745085840"
},
{
"type": "id",
"name": "tuple",
"content": "174508584001",
"key": "1745085840"
},
{
"type": "id",
"name": "tuple",
"content": "174508584002",
"key": "1745085840"
},
{
"type": "id",
"name": "tuple",
"content": "174508584003",
"key": "1745085840"
},
{
"type": "id",
"name": "tuple",
"content": "174508584004",
"key": "1745085840"
},
{
"type": "id",
"name": "tuple",
"content": "174508584005",
"key": "1745085840"
},
{
"type": "id",
"name": "tuple",
"content": "174508584006",
"key": "1745085840"
},
{
"type": "id",
"name": "tuple",
"content": "174508584007",
"key": "1745085840"
},
{
"content": "dspclient",
"type": "varchar",
"name": "username",
"key": "174508584001"
},
{
"content": "12345",
"type": "varchar",
"name": "password",
"key": "174508584001"
},
{
"content": "12345",
"type": "varchar",
"name": "cpassword",
"key": "174508584001"
},
{
"content": "n",
"type": "varchar",
"name": "PostgreSQL",
"key": "174508584001"
},
{
"content": "n",
"name": "IBMDB2",
"type": "varchar",
"key": "174508584001"
},
{
"content": "n",
"name": "MySQL",
"type": "varchar",
"key": "174508584001"
},
{
"content": "n",
"type": "varchar",
"name": "SQLServer",
"key": "174508584001"
},
{
"content": "n",
"name": "Hadoop",
"type": "varchar",
"key": "174508584001"
},
{
"content": "None",
"name": "dir1",
"type": "varchar",
"key": "174508584001"
},
{
"content": "None",
"name": "dir2",
"type": "varchar",
"key": "174508584001"
},
{
"content": "None",
"name": "dir3",
"type": "varchar",
"key": "174508584001"
},
{
"content": "None",
"name": "dir4",
"type": "varchar",
"key": "174508584001"
},
{
"type": "inet",
"name": "ipaddr",
"content": "1921680103",
"key": "174508584001"
},
{
"content": "y",
"name": "status",
"type": "varchar",
"key": "174508584001"
},
{
"content": "None",
"type": "varchar",
"name": "logintime",
"key": "174508584001"
},
{
"content": "None",
"type": "varchar",
"name": "logindate",
"key": "174508584001"
},
{
"content": "None",
"type": "varchar",
"name": "logouttime",
"key": "174508584001"
},
{
"content": "client",
"type": "varchar",
"name": "user_type",
"key": "174508584001"
},
{
"content": "royal",
"type": "varchar",
"name": "username",
"key": "174508584002"
},
{
"content": "12345",
"type": "varchar",
"name": "password",
"key": "174508584002"
},
{
"content": "12345",
"type": "varchar",
"name": "cpassword",
"key": "174508584002"
},
{
"content": "n",
"type": "varchar",
"name": "PostgreSQL",
"key": "174508584002"
},
{
"content": "n",
"name": "IBMDB2",
"type": "varchar",
"key": "174508584002"
},
{
"content": "n",
"name": "MySQL",
"type": "varchar",
"key": "174508584002"
},
{
"content": "n",
"type": "varchar",
"name": "SQLServer",
"key": "174508584002"
},
{
"content": "n",
"name": "Hadoop",
"type": "varchar",
"key": "174508584002"
},
{
"content": "None",
"name": "dir1",
"type": "varchar",
"key": "174508584002"
},
{
"content": "None",
"name": "dir2",
"type": "varchar",
"key": "174508584002"
},
{
"content": "None",
"name": "dir3",
"type": "varchar",
"key": "174508584002"
},
{
"content": "None",
"name": "dir4",
"type": "varchar",
"key": "174508584002"
},
{
"type": "inet",
"name": "ipaddr",
"content": "1921680105",
"key": "174508584002"
},
{
"content": "y",
"name": "status",
"type": "varchar",
"key": "174508584002"
},
{
"content": "190835899000",
"type": "varchar",
"name": "logintime",
"key": "174508584002"
},
{
"content": "20151002",
"type": "varchar",
"name": "logindate",
"key": "174508584002"
},
{
"content": "None",
"type": "varchar",
"name": "logouttime",
"key": "174508584002"
},
{
"content": "client",
"type": "varchar",
"name": "user_type",
"key": "174508584002"
},
{
"content": "abc",
"type": "varchar",
"name": "username",
"key": "174508584003"
},
{
"content": "12345",
"type": "varchar",
"name": "password",
"key": "174508584003"
},
{
"content": "12345",
"type": "varchar",
"name": "cpassword",
"key": "174508584003"
},
{
"content": "n",
"type": "varchar",
"name": "PostgreSQL",
"key": "174508584003"
},
{
"content": "n",
"name": "IBMDB2",
"type": "varchar",
"key": "174508584003"
}]}
In order to perform a fulltext search, I have created an index on the content attribute from a Python script, using the syntax:
c.DSP.ensureFulltextIndex("content");
where c is the database and DSP is the collection name. Now, I am trying to perform a search operation on the above data set by using the syntax:
FOR doc IN FULLTEXT(DSP, "content", "username") RETURN doc
Then an error occurs:
[1571] in function 'FULLTEXT()': no suitable fulltext index found for fulltext query on 'DSP' (while executing)
Please tell me the problem, and also tell me what the syntax will be when I try this query from a Python script.
Thanks...
Working from the 10 minutes tutorial and the driver documentation, I got it working like this:
from pyArango.connection import *
c = Connection()
db = c.createDatabase(name = "testdb")
DSP= db.createCollection(name = "DSP")
DSP.ensureFulltextIndex(fields=["content"])
doc = DSP.createDocument({"content": "test bla"})
doc.save()
print(db.AQLQuery('''FOR doc IN FULLTEXT(DSP, "content", "bla") RETURN doc''', 10))
Resulting in:
[{u'_key': u'1241175138503', u'content': u'test bla', u'_rev': u'1241175138503', u'_id': u'DSP/1241175138503'}]
I've used arangosh to revalidate the steps from the Python prompt:
arangosh> db._useDatabase("testdb")
arangosh [testdb]> db.DSP.getIndexes()
[
{
"id" : "DSP/0",
"type" : "primary",
"fields" : [
"_key"
],
"selectivityEstimate" : 1,
"unique" : true,
"sparse" : false
},
{
"id" : "DSP/1241140928711",
"type" : "hash",
"fields" : [
"content"
],
"selectivityEstimate" : 1,
"unique" : false,
"sparse" : true
},
{
"id" : "DSP/1241142960327",
"type" : "fulltext",
"fields" : [
"content"
],
"unique" : false,
"sparse" : true,
"minLength" : 2
}
]
arangosh [testdb]> db.DSP.toArray()
[
{
"content" : "test bla",
"_id" : "DSP/1241175138503",
"_rev" : "1241175138503",
"_key" : "1241175138503"
}
]
db._query('FOR doc IN FULLTEXT(DSP, "content", "bla") RETURN doc')
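Applied to the original data set, the equivalent search from a Python script would then be something like this (assuming the db object from above):

print(db.AQLQuery('''FOR doc IN FULLTEXT(DSP, "content", "username") RETURN doc''', 10))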
I'm trying to convert a filesystem tree to JSON using Python.
Imagine that I have the following tree:
plans/
|-- p1/
| |-- p1_1.pdf
| |-- p1_2.pdf
| `-- test/
| `-- test.jpg
|-- p2/
| |-- p2_1.pdf
| |-- p2_2.pdf
| `-- test2/
|
`-- resume.pdf
I would like to have a JSON output like this:
[
{
"name": "p1",
"type": "folder",
"path": "/plans/p1",
"tag": "org",
"children": [
{
"name": "p1_1.pdf",
"type": "file",
"path": "/plans/p1/p1_1.pdf",
"tag": "org"
},
{
"name": "p1_2.pdf",
"type": "file",
"path": "/plans/p1/p1_2.pdf",
"tag": "org"
},
{
"name": "test",
"type": "folder",
"path": "/plans/p1/test",
"tag": "org",
"children": [
{
"name": "test.jpg",
"type": "file",
"path": "/plans/p1/test/test.jpg",
"tag": "org"
}
]
}
]
},
{
"name": "p2",
"type": "folder",
"path": "/plans/p2",
"tag": "org",
"children": [
{
"name": "p2_1.pdf",
"type": "file",
"path": "/plans/p2/p2_1.pdf",
"tag": "org"
},
{
"name": "p2_2.pdf",
"type": "file",
"path": "/plans/p2/p2_2.pdf",
"tag": "org"
},
{
"name": "test2",
"type": "folder",
"path": "/plans/p2/test2",
"tag": "org",
"children": [
]
}
]
},
{
"name": "resume.pdf",
"type": "file",
"path": "/plans/resume.pdf",
"tag": "org"
}
]
I'm currently using the os.walk() Python function to go through the tree and create lists of dicts to generate a "dumpable" list using json.dumps(), but I don't know how to do it recursively.
Here is a quick code draft:
import os
import json

def tree_to_json(rootdir):
    main = []
    for path, dirs, files in os.walk(rootdir):
        for curdir in dirs:
            child = []
            new_dir = {"name": curdir,
                       "type": "folder",
                       "path": path + os.sep + curdir,
                       "children": child}
            main.append(new_dir)
        for curfile in files:
            new_file = {"name": curfile,
                        "type": "file",
                        "path": path + os.sep + curfile}
            main.append(new_file)
    return json.dumps(main, sort_keys=True, indent=2, separators=(',', ': '))
As with anything in programming, there are many ways to solve this. Here is one solution:
import json
from os import walk, path

def file_to_dict(fpath):
    return {
        'name': path.basename(fpath),
        'type': 'file',
        'path': fpath,
        'tag': 'org',
    }

def folder_to_dict(rootpath):
    return {
        'name': path.basename(rootpath),
        'type': 'folder',
        'path': rootpath,
        'tag': 'org',
        'children': [],
    }

def tree_to_dict(rootpath):
    root_dict = folder_to_dict(rootpath)
    # Only look at the top level of this folder; recursion handles the rest
    root, folders, files = next(walk(rootpath))
    root_dict['children'] = [file_to_dict(path.sep.join([root, fpath])) for fpath in files]
    root_dict['children'] += [tree_to_dict(path.sep.join([root, folder])) for folder in folders]
    return root_dict

def tree_to_json(rootdir, pretty_print=True):
    root, folders, files = next(walk(rootdir))
    root_dict = [tree_to_dict(path.sep.join([root, folder])) for folder in folders]
    root_dict += [file_to_dict(path.sep.join([root, fpath])) for fpath in files]
    if pretty_print:
        js = json.dumps(root_dict, indent=4)
    else:
        js = json.dumps(root_dict)
    return js

print(tree_to_json('/tmp/tree'))
And here is the output:
[
{
"path": "/tmp/tree/p1",
"tag": "org",
"type": "folder",
"name": "p1",
"children": [
{
"path": "/tmp/tree/p1/p1_1.pdf",
"tag": "org",
"type": "file",
"name": "p1_1.pdf"
},
{
"path": "/tmp/tree/p1/p1_2.pdf",
"tag": "org",
"type": "file",
"name": "p1_2.pdf"
},
{
"path": "/tmp/tree/p1/test",
"tag": "org",
"type": "folder",
"name": "test",
"children": [
{
"path": "/tmp/tree/p1/test/test.jpg",
"tag": "org",
"type": "file",
"name": "test.jpg"
}
]
}
]
},
{
"path": "/tmp/tree/p2",
"tag": "org",
"type": "folder",
"name": "p2",
"children": [
{
"path": "/tmp/tree/p2/p2_1.pdf",
"tag": "org",
"type": "file",
"name": "p2_1.pdf"
},
{
"path": "/tmp/tree/p2/p2_2.pdf",
"tag": "org",
"type": "file",
"name": "p2_2.pdf"
},
{
"path": "/tmp/tree/p2/test2",
"tag": "org",
"type": "folder",
"name": "test2",
"children": []
}
]
},
{
"path": "/tmp/tree/resume.pdf",
"tag": "org",
"type": "file",
"name": "resume.pdf"
}
]
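For what it's worth, on Python 3 the same shape can also be produced with pathlib, which keeps the recursion compact. A rough sketch (the sorted() ordering is an assumption; the answer above lists files before folders instead):

import json
from pathlib import Path

def node(p):
    # One dict per filesystem entry, recursing into directories
    entry = {"name": p.name,
             "type": "folder" if p.is_dir() else "file",
             "path": str(p),
             "tag": "org"}
    if p.is_dir():
        entry["children"] = [node(child) for child in sorted(p.iterdir())]
    return entry

root = Path('/tmp/tree')
print(json.dumps([node(child) for child in sorted(root.iterdir())], indent=4))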