add an array of linked document _ids to couchdb documents in python - python

I want to add a links property to each couchdb document based on data in a csv file.
the value of the links property is to be an array of dicts containing the couchdb _id of the linked document and the linkType
When I run the script i get a links error (see error info below)
I am not sure how to create the dict key links if it doesn't exist and add the link data, or otherwise append to the links array if it does exist.
an example of a document with the links will look like this:
{
_id: p_3,
name: 'Smurfette'
links: [
{to_id: p_2, linkType: 'knows'},
{to_id: o_56, linkType: 'follows'}
]
}
python script for processing the csv file:
#!/usr/bin/python
# coding: utf-8
# Version 1
#
# csv fields: ID,fromType,fromID,toType,toID,LinkType,Directional
import csv, sys, couchdb
def csv2couchLinks(database, csvfile):
# CouchDB Database Connection etc
server = couchdb.Server()
#assumes that couchdb runs on http://localhost:5984
db = server[database]
#assumes that db is already created
# CSV file
data = csv.reader(open(csvfile, "rb")) # Read in the CSV file rb=read/binary
csv_links= csv.DictReader(open(csvfile, "rb"))
def makeLink(from_id, to_id, linkType):
# get doc from db
doc = db[from_id]
# construct link object
link = {'to_id':to_id, 'linkType':linkType}
# add link reference to array at key 'links'
if doc['links'] in doc:
doc['links'].append(link)
else:
doc['links'] = [link]
# update the record in the database
db[doc.id] = doc
# read each row in csv file
for row in csv_links:
# get entityTypes as lowercase and entityIDs
fromType = row['fromType'].lower()
fromID = row['fromID']
toType = row['toType'].lower()
toID = row['toID']
linkType = row['LinkType']
# concatenate 'entity type' and 'id' to make couch '_id'
fromIDcouch = fromType[0]+'_'+fromID #eg 'p_2' <= person 2
toIDcouch = toType[0]+'_'+toID
makeLink(fromIDcouch, toIDcouch, linkType)
makeLink(toIDcouch, fromIDcouch, linkType)
# Run csv2couchLinks() if this is not an imported module
if __name__ == '__main__':
DATABASE = sys.argv[1]
CSVFILE = sys.argv[2]
csv2couchLinks(DATABASE,CSVFILE)
error info:
$ python LINKS_csv2couchdb_v1.py "qmhonour" "./tablesAsCsv/links.csv"
Traceback (most recent call last):
File "LINKS_csv2couchdb_v1.py", line 65, in <module>
csv2couchLinks(DATABASE,CSVFILE)
File "LINKS_csv2couchdb_v1.py", line 57, in csv2couchLinks
makeLink(fromIDcouch, toIDcouch, linkType)
File "LINKS_csv2couchdb_v1.py", line 33, in makeLink
if doc['links'] in doc:
KeyError: 'links'

Another option is condensing the if block to this:
doc.setdefault('links', []).append(link)
The dictionary's setdefault method checks to see if links exists in the dictionary, and if it doesn't, it creates a key and makes the value an empty list (the default). It then appends link to that list. If links does exist, it just appends link to the list.
def makeLink(from_id, to_id, linkType):
# get doc from db
doc = db[from_id]
# construct link object
link = {'to_id':to_id, 'linkType':linkType}
# add link reference to array at key 'links'
doc.setdefault('links', []).append(link)
# update the record in the database
db[doc.id] = doc

Replace:
if doc['links'] in doc:
With:
if 'links' in doc:

Related

Python 'list' object has no attribute 'keys' when trying to write a row in CSV file

I am trying to write a new row into a CSV file and I can't because I get an error in Python Shell.
Below is the code I am using (I am reading JSON from API and want to put data into CSV file)
# import urllib library
from urllib.request import Request, urlopen
c=1
# import json
import json
# store the URL in url as
# parameter for urlopen
import pandas as pd
import csv
headerList = ['name','id','order','height','weight','speed','special_defense','special_attack','defense','attack','hp']
# open CSV file and assign header
with open("pokemon_stats.csv", 'w') as file:
dw = csv.DictWriter(file, delimiter=',',
fieldnames=headerList)
dw.writeheader()
# display csv file
fileContent = pd.read_csv("pokemon_stats.csv")
for r in range(1,3):
req = Request('https://pokeapi.co/api/v2/pokemon/'+str(r)+'/', headers={'User-Agent': 'Chrome/32.0.1667.0'})
# store the response of URL
response = urlopen(req)
# storing the JSON response
# from url in data
data_json = json.loads(response.read())
#print(data_json)
for key, value in data_json.items():
if key=='name':
name=value
elif key=='id':
id=value
elif key=='order':
order=value
elif key=='height':
height=value
elif key=='weight':
weight=value
elif key == 'stats':
for sub in data_json['stats']:
for i in sub:
if i=='base_stat':
base_stat=sub[i]
if i=='stat':
for j in sub[i]:
if j=='name':
stat_name=sub[i][j]
if stat_name=='hp':
hp=base_stat
elif stat_name=='attack':
attack=base_stat
elif stat_name=='defense':
defense=base_stat
elif stat_name=='special-attack':
special_attack=base_stat
elif stat_name=='special-defense':
special_defense=base_stat
elif stat_name=='speed':
speed=base_stat
data = [name,id,order,height,weight,speed,special_defense,special_attack,defense,attack,hp]
dw.writerow(data)
After I try the execution of this code I get an error as it follows:
Traceback (most recent call last):
File "C:/Users/sbelcic/Desktop/NANOBIT_API.py", line 117, in <module>
dw.writerow(data)
File "C:\Users\sbelcic\AppData\Local\Programs\Python\Python37\lib\csv.py", line 155, in writerow
return self.writer.writerow(self._dict_to_list(rowdict))
File "C:\Users\sbelcic\AppData\Local\Programs\Python\Python37\lib\csv.py", line 148, in _dict_to_list
wrong_fields = rowdict.keys() - self.fieldnames
AttributeError: 'list' object has no attribute 'keys'*
Can somebody pls help and tell me what I am doing wrong.
I don't have working experience of manipulating JSON response with Python so any comments are welcome. If someone sees a better way to do this he is welcome to share.
Since dw is a DictionaryWriter, data needs to be a dictionary (currently it's a list) as seen in the documentation.
Convert data to a dictionary with your headers
data = [name,id,order,height,weight,speed,special_defense,special_attack,defense,attack,hp]
data = dict(zip(headerList, data))
dw.writerow(data)
Check the example for using the DictWriter. You need to pass a dictionary to writerow instead of a list, so your last line should be
data =['name':name,'id': id,'order':order,'height': height,'weight':weight,'speed':speed,'special_defense':special_defense,'special_attack':special_attack,'defense':defense,'attack':attack,'hp':hp]
dw.writerow(data)
Note that your whole code can also be simplified if you populate the data dictionary instead of all your if/else:
data={} #empty dictionary
#First extract everything that is on the main level of your dict
for key in ("name", "id", "order", "height", "weight":
if key in data_json:
data[key]=data_json[key]
#Check if the "stats" dict exists in your JSON data
if 'stats' in data_json:
if 'base_stat' in data_json['stats']:
data['base_stat']=data_json['stats']['base_stat']
if 'stat' in data_json['stats']:
statDict = data_json['stats']['stat']
for key in ['hp', 'attack', 'defense', 'special-attack', 'special-defense', 'speed']:
if key in statDict:
data[key]=statDict[key]
Notes:
I did not test this code, check it carefully, but I hope you get the idea
You could add else to all if key in checks to include an error message if a stat is missing
If you are sure that all keys will always be present, then you can skip a few of the if checks
I'm going to ignore the actual error that got you here, and instead propose a radical restructure: I think your code will be simpler and easier to reason about.
I've looked at the JSON returned from that Pokemon API and I can see why you started down the path you did: there's a lot of data, and you only need a small subset of it. So, you're going through a lot of effort to pick out exactly what you want.
The DictWriter interface can really help you here. Consider this really small example:
header = ['name', 'id', 'order']
with open('output.csv', 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=header)
writer.writeheader()
writer.writerow({'name': 'bulbasaur', 'id': 1, 'order': 1, 'species': {}})
Maybe you've run something like this before and got this error:
ValueError: dict contains fields not in fieldnames: 'species'
because the JSON you loaded has keys you didn't include when you created your writer... because you don't want them. And then, maybe you figured, "well, that means I've got to be very selective about what I put in the dict before passing to writerow()?
Since you've already defined which keys you care about for the header, use those keys to pull out what you want from the JSON:
header = ['name', 'id', 'order', 'height', 'weight',
'speed', 'special-defense', 'special-attack',
'defense', 'attack', 'hp']
all_data = json.load(open('1.json')) # bulbasaur, I downloaded this from the API URL
my_data = {}
for key in header:
my_data[key] = all_data.get(key) # will return None for sub-stats keys, which is okay for now
writer = csv.DictWriter(sys.stdout, fieldnames=header)
writer.writeheader()
writer.writerow(my_data)
The get(key_name) method on a dict (the JSON data) will try to find that key in the dict and return that key's value. If the key isn't found, None is returned. Running that I get the following CSV (the sub-stat columns are empty, as expected):
name,id,order,height,weight,speed,special_defense,special_attack,defense,attack,hp
bulbasaur,1,1,7,69,,,,,,
This has the same effect as your "if this key, then this value" statements, but it's driven by the data (header names) you already defined.
On to the sub-stats...
I think it's safe to assume that if there is a stats key in the JSON, each "stat object" in the list of stats will have the data you want. It's important to make sure you're only copying the stats you've specified in header; and again, you can use your data to drive the process:
for stat in all_data['stats']:
stat_name = stat['stat']['name']
if stat_name not in header:
continue # skip this sub-stat, no column for it in the CSV
base_stat = stat['base_stat']
my_data[stat_name] = base_stat
When I insert that loop, I now get this for my CSV output:
name,id,order,height,weight,speed,special_defense,special_attack,defense,attack,hp
bulbasaur,1,1,7,69,45,,,49,49,45
Some stats are populated, but some, the "special" stats are blank? That's because in your header you've named them like special_attack (with underscore) but in reality they're like special-attack (with hyphen). I fixed your header, and now I get:
name,id,order,height,weight,speed,special-defense,special-attack,defense,attack,hp
bulbasaur,1,1,7,69,45,65,65,49,49,45
Those are all the pieces you need. To put it together, I recommend the following structure... I'm a big fan of breaking up a process like this into distinct tasks: get all the data, then process all the data, then write all the processed data. It makes debugging easier, and less indentation of code:
# Make all API calls and record their JSON
all_datas = []
# loop over your API calls:
# make the request
# get the JSON data
# append JSON data to all_datas
# Process/transform the API JSON into what you want
my_data_rows = []
for all_data in all_datas:
my_data_row = {}
for key in header:
my_data_row[key] = all_data.get(key)
for stat in all_data['stats']:
stat_name = stat['stat']['name']
if stat_name not in header:
continue # skip this sub-stat
base_stat = stat['base_stat']
my_data[stat_name] = base_stat
# Write your transformed data to CSV
writer = csv.DictWriter(sys.stdout, fieldnames=header)
writer.writeheader()
writer.writerows(my_data_rows)

Error on Python DB Connection PostgreSQL (Module)

Currently, I am learning about databases with Python.
I am trying to make postgresql database connection under the OOP paradigm.
I followed all the steps from this article, but I got an error when I run my code.
All the code is the same, I just modified the database.ini file with my DB setting.
** database.ini code:
[postgresql]
host = localhost
port = 5432
database = dvdrental
user = postgres
password = 1234
** config.py code:
# import libraries
from configparser import ConfigParser
import configparser
from pathlib import Path
def get_project_root() -> Path:
''' Return project root folder '''
return Path(__file__).parents[1]
def config(config_db):
section = 'postgresql'
config_file_path = 'config/' + config_db
if (len(config_file_path) > 0 and len(section) > 0):
# create an instance of ConfigParser class
config_parser = ConfigParser()
# read the configuration file
config_parser.read(config_file_path)
# if the configuration file contains the provided section name
if(config_parser.has_section(section=section)):
# read options of the sections
config_params = config_parser.items(section=section)
# convert the list object to a python dictionary object
# define an empty dict
db_conn_dict = {}
# loop in the list
for config_param in config_params:
# get options key and value
key = config_params[0]
value = config_params[1]
# add the key value pair in the dictionary object
db_conn_dict[key] = value
# get connection object use above dictionary object
return db_conn_dict
** db_conn.py code:
# import libraries
import pandas as pd
import psycopg2
from config.config import config
# take in a PostgreSQL table and outputs a pandas dataframe
def load_db_table(config_db, query):
params = config(config_db)
engine = psycopg2.connect(**params)
data = pd.read_sql(query, con = engine)
return data
** main.py code:
# import library
from src.data.db_conn import load_db_table
from config.config import get_project_root
# project root
PROJECT_ROOT = get_project_root()
# read database
df = load_db_table(config_db = 'database.ini', query = 'SELECT * FROM actor LIMIT 5')
print(df)
The problem is, when I run the program I got the error:
TypeError: connect() keywords must be strings
PS D:\ASUS\MY CODES PYTHON\Iochordxsyy\db_connection> python main.py
Traceback (most recent call last):
File "main.py", line 9, in <module>
df = load_db_table(config_db = 'database.ini', query = 'SELECT * FROM actor LIMIT 5')
File "D:\ASUS\MY CODES PYTHON\Iochordxsyy\db_connection\src\data\db_conn.py", line 9, in load_db_table
engine = psycopg2.connect(**params)
TypeError: connect() keywords must be strings
This is the message when I debugged my code:
Exception has occurred: TypeError
connect() argument after ** must be a mapping, not NoneType
File "D:\ASUS\MY CODES PYTHON\Iochordxsyy\db_connection\src\data\db_conn.py", line 9, in load_db_table
engine = psycopg2.connect(**params)
File "D:\ASUS\MY CODES PYTHON\Iochordxsyy\db_connection\main.py", line 9, in <module>
df = load_db_table(config_db = 'database.ini', query = 'SELECT * FROM actor LIMIT 5')
I have checked all the code is the same as the article but I have no ideas why the error still occurs. Do you have any ideas?
If you have any ideas/solutions, it will be much appreciated.
Thank you.
Credit:
Thank you for the authors of the articles mentioned above.
I think "username" keyword in database.ini must be "user".
** database.ini code:
[postgresql]
host = localhost
port = 5432
database = dvdrental
user = postgres
password = 1234
[Solved]
Hi all, I got the answer. There is a mistake script on the config.ini file.
I change the script from "config_params[]" to "config_param[]"
** the correct script:
...
key = config_param[0]
value = config_param[1]
...
Thank you for all the comments.
Align the indentation of this: config.py, as shown below.
if (len(config_file_path) > 0 and len(section) > 0):
# create an instance of ConfigParser class
config_parser = ConfigParser()
# read the configuration file
config_parser.read(config_file_path)
# if the configuration file contains the provided section name
if(config_parser.has_section(section=section)):
# read options of the sections
config_params = config_parser.items(section=section)
# convert the list object to a python dictionary object
# define an empty dict
db_conn_dict = {}
# loop in the list
for config_param in config_params:
# get options key and value
key = config_params[0]
value = config_params[1]
# add the key value pair in the dictionary object
db_conn_dict[key] = value
# get connection object use above dictionary object
return db_conn_dict

snowflake external table from a list of jsons

got a file.json in my s3 bucket, it contains a list of jsons,
for instance when I download it and parse it with python json load I get a list:
[{'k': 'calendar#event'}, {'k': 'calendar#event'}]\
loading it into an external table works:
create external table if not exists TEST_111
with location = #TESt111
auto_refresh = true
file_format = (type = json);
but instead of getting a table with 2 rows, I get one row with a list in it,
any ideas?
If the value is provided as array then strip_outer_array could be used:
create external table if not exists TEST_111
with location = #TESt111
auto_refresh = true
file_format = (type = json, STRIP_OUTER_ARRAY=TRUE);
Additionally if the json keys are known in advance, they could be exposed as columns directly in external table's definition:
create external table if not exists TEST_111
(
filename TEXT metadata$filename
,k TEXT AS (value:"k"::TEXT)
)
with location = #TESt111
auto_refresh = true
file_format = (type = json, STRIP_OUTER_ARRAY=TRUE);

add parameters inside url

i want to build some function that read a url from txt file, then save it to some variable, then add some values inside the url between another values
example of the url: https://domains.livedns.co.il/API/DomainsAPI.asmx/NewDomain?UserName=apidemo#livedns.co.il&Password=demo
lets say i want to inject some values between UserName and Password and save it into file again and use it later.
i started to write the function and play with urllib parser but i still doesnt understand how to do that.
what i tried until now:
def dlastpurchase():
if os.path.isfile("livednsurl.txt"):
apikeyfile = open("livednsurl.txt", "r")
apikey = apikeyfile.read()
url_parse = urlsplit(apikey)
print(url_parse.geturl())
dlastpurchase()
Thanks in advance for every tip and help
A little bit more complex example that I believe you will find interesting and also enjoy improving it (while it takes care of some scenarios, it might be lacking in some). Also functional to enable reuse in other cases. Here we go
assuming we have a text file, named 'urls.txt' that contains this url
https://domains.livedns.co.il/API/DomainsAPI.asmx/NewDomain?UserName=apidemo#livedns.co.il&Password=demo
from os import error
from urllib.parse import urlparse, parse_qs, urlunparse
filename = 'urls.txt'
function to parse the url and return its query parameters as well as the url object, which will be used to reconstruct the url later on
def parse_url(url):
"""parse a given url and return its query parameters
Args:
url (string): url string to parse
Returns:
parsed (tupple): the tupple object returned by urlparse
query_parameters (dictionary): dictionary containing the query parameters as keys
"""
try :
# parse the url and get the queries parameters from there
parsed = urlparse(url)
# parse the queries and return the dictionary containing them
query_result = parse_qs(parsed.query)
return (query_result, parsed)
except(error):
print('something failed !!!')
print(error)
return False
function to add a new query parameter or to replace an existing one
def insert_or_replace_word(query_dic, word,value):
"""Insert a value for the query parameters of a url
Args:
query_dic (object): the dictionary containing the query parameters
word (string): the query parameter to replace or insert values for
value (string): the value to insert or use as replacement
Returns:
result (string):the result of the insertion or replacement
"""
try:
query_dic[word] = value
return query_dic
except (error):
print('Something went wrong {0}'.format(error))
function to format the query parameter and get them ready to reconstruct the new url
def format_query_strings(query_dic):
"""format the final query dictionaries ready to be used to construct a new url and construct the new url
Args:
query_dic (dictionary): final query dictionary after insertion or update
"""
final_string = ''
for key, value in query_dic.items():
#unfortunatly, query params from parse_qs are in list, so remove them before creating the final string
if type(value) == list:
query_string = '{0}={1}'.format(key, value[0])
final_string += '{0}&'.format(query_string)
else:
query_string = '{0}={1}'.format(key, value)
final_string += '{0}&'.format(query_string)
# this is to remove any extra & inserted at the end of the loop above
if final_string.endswith('&'):
final_string = final_string[:len(final_string)-1]
return final_string
we check out everything works by reading in text file, performing above operation and then saving the new url to a new file
with open(filename) as url:
lines = url.readlines()
for line in lines:
query_params,parsed = parse_url(line)
new_query_dic = insert_or_replace_word(query_params,'UserName','newUsername')
final = format_query_strings(new_query_dic)
#here you have to pass an iterable of lenth 6 in order to reconstruct the url
new_url_object = [parsed.scheme,parsed.netloc,parsed.path,parsed.params,final,parsed.fragment]
#this reconstructs the new url
new_url = urlunparse(new_url_object)
#create a new file and append the link inside of it
with open('new_urls.txt', 'a') as new_file:
new_file.writelines(new_c)
new_file.write('\n')
You don't have to use fancy tools to do that. Just split the url based on "?" Character. Then, split the second part based on "&" character. Add your new params to the list you have, and merge them with the base url you get.
url = "https://domains.livedns.co.il/API/DomainsAPI.asmx/NewDomain?UserName=apidemo#livedns.co.il&Password=demo"
base, params = url.split("?")
params = params.split("&")
params.insert(2, "new_user=yololo&new_passwd=hololo")
for param in params:
base += param + "&"
base = base.strip("&")
print(base)
I did it like this since you asked for inserting to a specific location. But url params are not depends on the order, so you can just append at the end of the url for ease. Or, you can edit the parameters from the list I show.

Python & JSON | Can't access dictionary using an index

I have a program that looks like this:
import json
import requests
article_name = "BT Centre"
article_api_url = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro=&explaintext=&titles={}".format(article_name)
called_data = requests.get(article_api_url)
formatted_data = called_data.json()
print(formatted_data)
pages = formatted_data["query"]["pages"]
print(pages)
first_page = pages[0]["extract"]
print(first_page)
For the first print statement, where it prints the whole JSON, it returns this:
{
'batchcomplete': '',
'query':{
'pages':{
'18107207':{
'pageid': 18107207,
'ns': 0,
'title':'BT Centre',
'extract': "The BT Centre is the global headquarters and registered office of BT Group..."
}
}
}
}
When I try to access the "extract" data with the first_page variable, it returns:
Traceback (most recent call last):
File "wiki_json_printer.py", line 15, in <module>
first_page = pages[0]["extract"]
KeyError: 0
The problem is, I can't set first_page to pages["18107207"]["extract"] because the Page ID changes for every article.
Edit: Solution from Ann Zen works:
You can use a for loop to loop through the keys of the pages
dictionary, and detect which one is the ID via the str.isdigit()
method:
for key in pages:
if key.isdigit():
print(pages[key]["extract"])
You can use a for loop to loop through the keys of the pages dictionary, and detect which one is the ID via the str.isdigit() method:
for key in pages:
if key.isdigit():
print(pages[key]["extract"])
You could use next on an iterator on the dict to find the first key:
...
key = next(iter(pages))
first_page = pages[key]["extract"]
...
pages is dictionary not a list you can't select it by index, use it key
print(pages['18107207']['extract'])
of course the following will work because the key is 18107207
for key in pages:
print(pages[key]["extract"])

Categories