how to change a specific records in Mongodb - python

I have a database in MongoDB that contains a collection called users, which holds information about people such as name, job title, gender, etc. The problem is that the gender is wrong in most cases: the name is a male name, but the gender is recorded as female when it should be male.
I would be happy if you could help me find a way to write a python code to fix the problem.
I have two files, one containing female names and one containing male names. Here is the code I have written, but it is not working: some of the records remain unchanged and I do not know what is wrong:
"""
import json
import sys
from pymongo import MongoClient
from tqdm import tqdm
client = MongoClient("mongodb://192.168.20.92:27017")
users = client["97_Production_DB"]["invalid"]
updated_users_sepid = client["db_97_updated"]["users"]
invalid_users_sepid = client["db_97_updated"]["invalid"]
with open("male.json", "rb") as f:
male = json.load(f)
with open("female.json", "rb") as f:
female = json.load(f)
person = {}
for key in male:
person[key] = True
for key in female:
person[key] = False
for data in tqdm(users.find(no_cursor_timeout=True)):
firstName = data["firstName"].replace("سید", "").replace("سیده", "").replace("سادات", "").replace(" ", "")
if firstName in person.keys:
data["gender"] = person.get(firstName)
updated_users_sepid.insert_one(data)
else:
invalid_
"""

Related

Cleaning .csv text data in Python

I have recently created a python program that would import my finances from a .csv file and transfer it onto google sheets. However, I am struggling to figure out how to fix the names that my bank gives me.
Example:
ME DC SI XXXXXXXXXXXXXXXX NETFLIX should just be NETFLIX,
POS XXXXXXXXXXXXXXXX STEAM PURCHASE should just be STEAM and so on
Forgive me if this is a stupid question as I am a newbie when it comes to coding and I am just looking to use it to automate certain situations in my life.
import csv
from unicodedata import category
import gspread
import time
MONTH = 'June'
# Set month name
file = f'HDFC_{MONTH}_2022.csv'
#the file we need to extract data from
transactions = []
# Create empty list to add data to
def hdfcFin(file):
    """Read the bank CSV at ``file`` and collect its rows as transactions.

    Every row contributes one ``(date, name, expense, income, category)``
    tuple: the first two columns are kept as text, the third and fourth are
    parsed as floats, and the category is always ``'other'``. The tuples are
    appended to the module-level ``transactions`` list, which is also the
    return value.
    """
    with open(file, mode='r') as source:
        for record in csv.reader(source):
            entry = (
                record[0],
                record[1],
                float(record[2]),
                float(record[3]),
                'other',
            )
            transactions.append(entry)
    return transactions
sa = gspread.service_account()
# connect json to api
sh = sa.open('Personal Finances')
wks = sh.worksheet(f'{MONTH}')
rows = hdfcFin(file)
for row in rows:
wks.insert_row([row[0], row[1], row[4], row[2], row[3]], 8)
time.sleep(2)
# time delay because of api restrictions
If you don't have a specific format for identifying the name, you can use the logic below. It uses a dictionary of key-value pairs: if the name is one of the keys, it is replaced with the corresponding value.
# Map each raw bank description to the clean name it should become.
d = {
    'ME DC SI XXXXXXXXXXXXXXXX NETFLIX': 'NETFLIX',
    'POS XXXXXXXXXXXXXXXX STEAM PURCHASE': 'STEAM',
}
test = 'POS XXXXXXXXXXXXXXXX STEAM PURCHASE'
# dict.get performs a single lookup and leaves unknown descriptions untouched.
test = d.get(test, test)
print(test)
Output:
STEAM
If requirement is to fetch only last word out of your name then you can use below logic.
test='ME DC SI XXXXXXXXXXXXXXXX NETFLIX'
test=test.split(" ")[-1]
print(test)
Output:
NETFLIX

azure.cognitiveservices.vision.face.models._models_py3.APIErrorException: (InvalidImageSize) Image size is too small

ERROR MESSAGE
I am trying to upload images of reasonable size(around 20KB). But according to documentation image of size 1KB to 6MB can be uploaded. I hope there is some part of the program that needs modification to rectify the error.
File "add_person_faces.py", line 46, in <module>
res = face_client.person_group_person.add_face_from_stream(global_var.personGroupId, person_id, img_data)
File "C:\Python\Python36\lib\site-packages\azure\cognitiveservices\vision\face\operations\_person_group_person_operations.py", line 785, in add_face_from_stream
raise models.APIErrorException(self._deserialize, response)
azure.cognitiveservices.vision.face.models._models_py3.APIErrorException: (InvalidImageSize) Image size is too small.
CODE
import os, time
import sqlite3
import sys
import urllib

import requests
from azure.cognitiveservices.vision.face import FaceClient
from azure.cognitiveservices.vision.face.models import TrainingStatusType, Person, SnapshotObjectType, OperationStatusType
from msrest.authentication import CognitiveServicesCredentials
from requests.packages.urllib3.exceptions import InsecureRequestWarning

import global_variables as global_var
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
KEY = global_var.key
ENDPOINT = 'https://centralindia.api.cognitive.microsoft.com'
face_client = FaceClient(ENDPOINT,CognitiveServicesCredentials(KEY))
def get_person_id():
    """Return the person id stored for the student named on the command line.

    The last two characters of ``sys.argv[1]`` are used as the student ID and
    looked up in the ``Students`` table of the ``Face-DataBase`` SQLite file.
    The fourth column of the matching row is returned.

    Raises:
        LookupError: if no row in ``Students`` has that ID.
    """
    extractId = str(sys.argv[1])[-2:]
    # Bind a number when the suffix is numeric, so the comparison behaves like
    # the numeric literal the query text used to contain.
    try:
        student_id = int(extractId)
    except ValueError:
        student_id = extractId

    connect = sqlite3.connect("Face-DataBase")
    try:
        c = connect.cursor()
        # Parameterised query: the value comes from the command line and must
        # never be spliced into the SQL text.
        c.execute("SELECT * FROM Students WHERE ID = ?", (student_id,))
        row = c.fetchone()
    finally:
        # Close the connection even when the query fails.
        connect.close()

    if row is None:
        raise LookupError(
            "No student with ID {!r} in Students".format(extractId)
        )
    return row[3]
# Run the upload only when a command-line argument (the dataset sub-folder
# name) was supplied. Compare by value: ``is not 1`` tests object identity.
if len(sys.argv) != 1:
    currentDir = os.path.dirname(os.path.abspath(__file__))
    imageFolder = os.path.join(currentDir, "dataset/" + str(sys.argv[1]))
    person_id = get_person_id()
    for filename in os.listdir(imageFolder):
        if filename.endswith(".jpg"):
            print(filename)
            # The context manager closes each image once it has been uploaded.
            with open(os.path.join(imageFolder, filename), "rb") as img_data:
                res = face_client.face.detect_with_stream(img_data)
                if not res:
                    print('No face detected from image {}'.format(filename))
                    continue
                # The detect call has already read the stream to its end.
                # Rewind it, otherwise the next call uploads zero bytes and
                # the service rejects it as "Image size is too small".
                img_data.seek(0)
                res = face_client.person_group_person.add_face_from_stream(global_var.personGroupId, person_id, img_data)
            print(res)
            # Pause between uploads.
            time.sleep(6)
else:
    print("supply attributes please from dataset folder")
After looking through the API calls you are making, I realized there are some elements missing. You might not have posted the entire code, but I will add a sample below that illustrates the steps. Using the steps avoids any image errors, so the 'wrong size' image error could have likely been from missing steps.
In your code, before you can add an image to a Person Group Person (PGP), you have to create a Person Group(PG) for that PGP to belong to. Then after you create the Person Group (it is empty at the start), you must create a Person Group Person with that PG ID in it. Once those two things are created, then you can start adding images to your Person Group Person.
So here are the steps summarized above:
Create a Person Group with the API call create()
Create a Person Group Person with its API call for create()
Add your image(s) to the Person Group Person with the API call add_face_from_stream()
Once you have added all your images that belong to your Person Group Person, then you can use data from it however you like.
See the code sample below, where a single local image is uploaded and added to a Person Group Person. I'll include the image I am using if you wanted to download and test.
import os
from azure.cognitiveservices.vision.face import FaceClient
from msrest.authentication import CognitiveServicesCredentials

# Credentials are read from the environment rather than hard-coded.
KEY = os.environ['FACE_SUBSCRIPTION_KEY']
ENDPOINT = os.environ['FACE_ENDPOINT']
face_client = FaceClient(ENDPOINT, CognitiveServicesCredentials(KEY))

person_group_id = 'women_person_group'
person_id = 'women_hats'
image_name = 'woman_with_sunhat.jpg'

# Create empty Person Group. Person Group ID must be lower case, alphanumeric, and/or with '-', '_'.
print('Creating a Person Group:', person_group_id)
face_client.person_group.create(person_group_id=person_group_id, name=person_group_id)

# Create a Person Group Person.
print('Creating the Person Group Person:', person_id)
women_hat_group = face_client.person_group_person.create(person_group_id, person_id)

# Add image to our Person Group Person.
print('Adding face to the Person Group Person:', person_id)
# Read-only binary mode is enough for an upload, and the context manager
# closes the file once the call returns.
with open(image_name, 'rb') as face_image:
    face_client.person_group_person.add_face_from_stream(person_group_id, women_hat_group.person_id, face_image)

# Print ID from face.
print('Person ID:', women_hat_group.person_id)

# Since testing, delete the Person Group, so no duplication conflicts if script is run again.
face_client.person_group.delete(person_group_id)
print()
print("Deleted the person group {} from the Azure Face account.".format(person_group_id))

How to print specific key:value pairs from a pickled dictionary

I made a pickled dictionary, now I only want to retrieve the values associated with a user-inputted country.
import pickle
choice = input("Choose a country: ")
choice.capitalize()
file_name = "nationDict.dat"
fileObject = open(file_name, 'rb')
countries = pickle.load(fileObject)
for choice in countries:
print(choice)
and I use this classmethod to create the dictionary
@classmethod
def dictMaker(cls):
    """Pickle four per-country lookup tables built from the Nation lists.

    For each of the first 193 entries, the country name is mapped to its
    continent, population, area and density. The four dictionaries are
    written to ``nationDict.dat`` as a single tuple, in the order
    ``(continent, population, area, density)``.
    """
    indices = range(193)
    dictCont = {Nation.country[i]: Nation.continent[i] for i in indices}
    dictPop = {Nation.country[i]: Nation.population[i] for i in indices}
    dictArea = {Nation.country[i]: Nation.area[i] for i in indices}
    dictDensity = {Nation.country[i]: Nation.density[i] for i in indices}
    # One dump call stores all four tables; a reader gets them back as a tuple.
    with open("nationDict.dat", 'wb') as pickUN:
        pickle.dump((dictCont, dictPop, dictArea, dictDensity), pickUN, protocol=pickle.HIGHEST_PROTOCOL)
I want to get data only for the country of choice, but I don't understand how. I end up getting the data for every country, I do get the 4 different sets of info I want though, but I want it for only 1 country. Everything I look up is about printing entire dictionaries, but I can't find anything talking about individual values only. I've tried just about every keyword to find things on this site.
I would consider storing your country data in a different form, such as nested dictionary:
import pickle
countries = {
Nation.country[i]: {
"continent": Nation.continent[i],
"population": Nation.population[i],
"area": Nation.area[i],
"density": Nation.density[i],
}
for i in range(193)
}
# Now you can pickle only one object:
with open("nation_dict.dat", "wb") as fh:
pickle.dump(countries, fh, protocol=pickle.HIGHEST_PROTOCOL)
And your script becomes:
import pickle

choice = input("Choose a country: ")
# str.capitalize() returns a new string, so the result has to be kept.
choice = choice.capitalize()
# Must be the same file the nested dictionary was pickled to.
file_name = "nation_dict.dat"
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(file_name, 'rb') as fh:
    countries = pickle.load(fh)
# .get() returns None instead of raising when the country is not present.
print(countries.get(choice))
# {"continent": "Europe", "population": 123456789, "area": 12345, "density": 12345}
Once your script is working I recommend posting on Code Review.
for countryDict in countries:
print(countryDict[choice])
Should do the trick. The variable that you have defined as countries is actually a tuple of dictionaries (dictCont, dictPop, dictArea, dictDensity). So the for loop iterates over each of those dicts and then gets the country of choice from them. In this case, countries is a poor name choice. I had read it and assumed it was a single dictionary with an array of values, as I was too lazy to read your second code block. As a rule of thumb, always assume other coders are lazy. Trust me.

Python - Replacing a specific value in a CSV file while keeping the rest

So I have a CSV file that looks something like this:
Username,Password,Name,DOB,Fav Artist,Fav Genre
Den1994,Denis1994,Denis,01/02/1994,Eminem,Pop
Joh1997,John1997,John,03/04/1997,Daft Punk,House
What I need to be able to do is let the user edit and change their Fav Artist and Fav Genre so that their new values are saved to the file in place of the old ones. I'm not the very advanced when it comes to CSV so I'm not sure where to begin with it, therefore any help and pointers will be greatly appreciated.
Thanks guys.
EDIT:
Adding the code I have so far so it doesn't seem like I'm just trying to get some easy way out of this, generally not sure what to do after this bit:
def editProfile():
    """Show the current user's favourites and ask for replacements.

    Scans ``users.csv`` for the row whose first field equals
    ``globalUsername``, prints that row's favourite artist and genre, and
    prompts for new values.

    NOTE(review): the new values are only collected here; nothing is written
    back to the file yet.
    """
    username = globalUsername
    # The context manager closes the file even if a row is malformed.
    with open("users.csv", "r") as file:
        for line in file:
            # Drop the line terminator so the last field (genre) is clean.
            field = line.rstrip("\n").split(",")
            storedUsername = field[0]
            favArtist = field[4]
            favGenre = field[5]
            if username == storedUsername:
                print("Your current favourite artist is:", favArtist,"\n" +
                      "Your current favourite genre is:",favGenre,"\n")
                wantNewFavArtist = input("If you want to change your favourite artist type in Y, if not N: ")
                wantNewFavGenre = input("If you want to change your favourite genre type in Y, if not N: ")
                if wantNewFavArtist == "Y":
                    newFavArtist = input("Type in your new favourite artist: ")
                if wantNewFavGenre == "Y":
                    newFavGenre = input("Type in your new favourite genre: ")
This is how it would look like using pandas
import pandas as pd
from io import StringIO

# Things you'll get from a user
globalUsername = "Den1994"
field = 'Fav Artist'
new_value = 'Linkin Park'

# Things you'll probably get from a data file
data = """
Username,Password,Name,DOB,Fav Artist,Fav Genre
Den1994,Denis1994,Denis,01/02/1994,Eminem,Pop
Joh1997,John1997,John,03/04/1997,Daft Punk,House
"""

# Load your data (e.g. from a CSV file)
df = pd.read_csv(StringIO(data)).set_index('Username')
print(df)

# Now change something.
# Select row and column in a single .loc call: chained indexing
# (df.loc[row][col] = ...) may assign to a temporary copy and leave df
# unchanged.
df.loc[globalUsername, field] = new_value
print(df)
Here df.loc[] lets you access a row by its index label. In this case Username is set as the index, so globalUsername picks the row, and field then selects the column within that row.
Also, consider this:
# One .loc call with a row label and a list of columns; chained indexing
# may assign to a temporary copy instead of df.
df.loc[globalUsername, ['Fav Artist', 'Fav Genre']] = ['Linkin Park', 'Nu Metal']
In case you have a my-data.csv file you can load it with:
df = pd.read_csv('my-data.csv')
The code above will return
Password Name DOB Fav Artist Fav Genre
Username
Den1994 Denis1994 Denis 01/02/1994 Eminem Pop
Joh1997 John1997 John 03/04/1997 Daft Punk House
and
Password Name DOB Fav Artist Fav Genre
Username
Den1994 Denis1994 Denis 01/02/1994 Linkin Park Pop
Joh1997 John1997 John 03/04/1997 Daft Punk House
Try this
import pandas as pd
data = pd.read_csv("old_file.csv")
data.loc[data.Username=='Den1994',['Fav Artist','Fav Genre']] = ['Beyonce','Hard rock']
data.to_csv('new_file.csv',index=False)
python has a built-in module dealing with csv, there are examples in the docs that will guide you right.
One way to do is to use the csv module to get the file you have into a list of lists, then you can edit the individual lists (rows) and just rewrite to disk what you have in memory.
Good luck.
PS: in the code that you have posted there is no assignment to the "csv in memory" based on the user-input
a minimal example without the file handling could be:
fake = 'abcdefghijkl'
csv = [list(fake[i:i+3]) for i in range(0, len(fake), 3)]
print(csv)
for row in csv:
if row[0] == 'd':
row[0] = 'changed'
print(csv)
the file handling is easy to get from the docs, and pandas dependance is avoided if that is on the wishlist

Python Parsing with lxml

I've created the following scraper for NFL play-by-play data. It writes the results to a csv file and does everything I need it to except I don't know how to attach a column for who actually has possession of the ball in each line of the csv file.
I can grab the text from the "home" and "away" <tr> tag to show who is playing in the game for query purposes later, but I need the scraper to recognize when possession changes (goes from home to away or vice versa). I'm fairly new to Python and have tried different indention but I don't think that's the issue. Any help would be greatly appreciated. I feel like the answer is beyond my scope of understanding.
I also realize that my code probably isn't the most Pythonic but I'm still learning. I'm using Python 2.7.9.
import lxml
from lxml import html
import csv
import urllib2
import re
def _first_match(pattern, text, group=0):
    """Return the given group of the first match of pattern in text, or None."""
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(group)


game_date = raw_input('Enter game date: ')
data_html = 'http://www.cbssports.com/nfl/gametracker/playbyplay/NFL_20160109_PIT#CIN'
url = urllib2.urlopen(data_html).read()
data = lxml.html.fromstring(url)
plays = data.cssselect('tr#play')
home = data.cssselect('tr#home')
away = data.cssselect('tr#away')

# The team names do not depend on the play, so they are extracted once
# instead of once per play. As before, the last matching row wins.
for team in away:
    teamA = re.search(r'(\w+)\s', team.text_content()).group(1).upper()
for team in home:
    teamH = re.search(r'(\w+)\s', team.text_content()).group(1).upper()

# Python 2's csv module expects the file in binary mode; text mode adds
# blank rows on Windows. The with-block closes the file even on error.
with open('C:\\DATA\\PBP.csv', 'ab') as csvfile:
    writer = csv.writer(csvfile)
    for play in plays:
        play = play.text_content()
        # A pattern that does not match contributes None, which the csv
        # writer emits as an empty cell.
        frame = [
            game_date,
            _first_match(r'\d', play),                    # down
            _first_match(r'-(\d+)', play, 1),             # distance
            _first_match(r'[A-Z]+', play),                # field end
            _first_match(r'[A-Z]+([\d]+)', play, 1),      # yard line
            _first_match(r'\((..*\d)\)\s', play, 1),      # time
            teamA,
            teamH,
            _first_match(r'\s(.*)', play),                # description
        ]
        writer.writerow(frame)
I guess you need to append another value to the frame for each row, which is an indication of whether the possession changed.
After:
frame.append(desc)
add:
if teamA == teamH:
frame.append("Same possession")
else:
frame.append("Changed possession")
(note this assumes the team names are consistent, no extra spaces/padding/formatting in the teamA/teamH values).
You don't have to use strings, for example you could put 0 for no change and 1 for a change of possession.
HTH
Barny

Categories