insert array into mongodb using pymongo - python

I am trying to add an array to MongoDB using pymongo.
I have another program that will return something like
['1 aksdfjas;dkfjsa;dfkj','2 ;alksdjf;askdjf;asdfjkasdf', '3 ;alksdfj;asdlkfj;asdfj']
and I want to add them into the insert.
1)I cannot think of any other ways to do it so I am converting them to string and concatenate and trying to add them to the post(there must be better way no?)
2) When I do this, instead of the desired effect, I get
["'1 aksdfjas;dkfjsa;dfkj','2 ;alksdjf;askdjf;asdfjkasdf', '3 ;alksdfj;asdlkfj;asdfj'",]
That extra quotes.. how can I correct this?
import pymongo
import time
import datetime
from random import *
from pymongo import MongoClient
client = MongoClient('mongodb://user:abc123#10.0.0.1:27017')
stringToStuff = 'blabh blah blahhhhh'
def createLoop():
return randint(5,15)
def tGenerator(e):
# Build ONE string made of e numbered items (1..e), each followed by a comma.
# e: number of items to generate.
# Returns a single str, not a list of str.
returnString = ''
for i in range(e):
# Each item is wrapped in literal single-quote characters, so the quotes
# become part of the data: "'1 <stringToStuff>','2 <stringToStuff>',".
returnString += "'" + str(i+1) + " " + stringToStuff + "',"
return returnString
db = client['pytest']
collection = db['test']
names = db.test.find()
collection2 = db['pytestResult']
for p in names:
print(p['name'])
name2 = p['name']
#post = {"name":name2,"score":8,"date":datetime.datetime.now()}
post = {
"name":name2,
"score":8,
"date":datetime.datetime.now(),
#”output”: ['1 aksdfjas;dkfjsa;dfkj','2 ;alksdjf;askdjf;asdfjkasdf', '3 ;alksdfj;asdlkfj;asdfj',]
“output”: [tGenerator(createLoop())]
}
collection2.insert_one(post)

First, change how you are constructing the string from tGenerator method to below:
returnString += str(i+1) + " " + stringToStuff + ","
Second, you can use the split method to do the required, so your insertion will look something like below:
post = {
"name":name2,
"score":8,
"date":datetime.datetime.now(),
"output": tGenerator(createLoop()).split(',')
}
collection2.insert_one(post)
I hope, the above works for you.

Related

How to build tuple array variable of integers + strings in python for loop?

Manually built variable STAFFLIST works in django database as a list of choices listed as PositiveSmallIntegerField.
STAFF1 = 1
STAFF2 = 2
STAFF3 = 3
STAFFLIST = (
(STAFF1, _('John Doe')),
(STAFF2, _('Lisa Beauty')),
(STAFF3, _('Harry Potter')),
)
Print for this variable works as follows:
>>>print(STAFFLIST[1])
(2, 'Lisa Beauty')
>>> print(STAFFLIST[1][0])
2
>>> print(STAFFLIST[1][1])
Lisa Beauty
>>>
How can I build this variable automatically in a for loop, based on the actual list of staff? I was trying something like the code below, but without success:
from django.contrib.auth.models import User
from django.utils.translation import gettext as _
staff = User.objects.filter(is_staff=True)
idx=0
for stf in staff:
STAFFLIST[idx] = (stf.id, _(stf.first_name + ' ' + stf.last_name))
idx=idx+1
I am getting error like:
TypeError: 'str' object does not support item assignment
I believe, my problem is low knowledge level of python datatypes. And currently, I am trying to work with tuple datatype and to store string inside.
I have solved the for loop problem. The following code works and builds the same tuple array as the manually defined variable:
from django.contrib.auth.models import User
from django.utils.translation import gettext as _
staff = User.objects.filter(is_staff=True)
# Build the (id, "First Last") choices in a single pass. Unlike the
# first-iteration flag approach, STAFFLIST is always bound: an empty queryset
# simply produces an empty tuple.
STAFFLIST = tuple(
    (stf.id, _(stf.first_name + ' ' + stf.last_name)) for stf in staff
)

Insert table data from website into table on my own website using Python and Beautiful Soup

I wrote some code that grabs the numbers I need from this website, but I don't know what to do next.
It grabs the numbers from the table at the bottom. The ones under calving ease, birth weight, weaning weight, yearling weight, milk and total maternal.
#!/usr/bin/python
import urllib2
from bs4 import BeautifulSoup
import pyperclip
def getPageData(url):
    """Fetch ``url`` and return the ``<td>`` cells of its EBV table.

    Returns -1 (without any network access) when ``url`` does not contain
    'abri.une.edu.au'. Otherwise returns the list of ``td`` elements found in
    the first table with class 'TablesEBVBox', with the cell at index 7
    removed.

    Raises AttributeError if the page has no such table and IndexError if the
    table has fewer than eight cells.
    """
    from contextlib import closing

    if 'abri.une.edu.au' not in url:
        return -1

    # urllib2 responses are not context managers, so close explicitly.
    with closing(urllib2.urlopen(url)) as response:
        webpage = response.read()
    soup = BeautifulSoup(webpage, "html.parser")

    # This finds the epd table and keeps it as a searchable tree.
    pedTreeTable = soup.find('table', {'class': 'TablesEBVBox'})
    # Every td inside the table, in document order.
    pageData = pedTreeTable.find_all('td')
    # NOTE(review): the reason the 8th cell is discarded is not visible here;
    # createPedigree's fixed indexes (18-23) depend on it being removed.
    pageData.pop(7)
    return pageData
def createPedigree(animalPageData):
    """Turn the table cells from getPageData into a dict of named values.

    ``animalPageData`` is an iterable of elements exposing ``.text``. The
    cells at positions 18-23 are mapped to 'calving_ease', 'birth_weight',
    'wean_weight', 'year_weight', 'milk' and 'total_mat'. Every value except
    'year_weight' is passed through stripRegNumber.

    Raises IndexError when fewer than 24 cells are supplied.
    """
    animals = [animal.text for animal in animalPageData]
    prettyPedigree = {
        'calving_ease': animals[18],
        'birth_weight': animals[19],
        'wean_weight': animals[20],
        'year_weight': animals[21],
        'milk': animals[22],
        'total_mat': animals[23],
    }
    # NOTE(review): stripRegNumber removes all-digit words, so an unsigned
    # integer value such as "35" would be emptied -- confirm the cells always
    # carry a sign or decimal point.
    for animalKey in prettyPedigree:
        if animalKey != 'year_weight':
            prettyPedigree[animalKey] = stripRegNumber(prettyPedigree[animalKey])
    return prettyPedigree
def stripRegNumber(animal):
    """Return ``animal`` with every all-digit word removed.

    The text is split on whitespace; each remaining word is followed by a
    single space, so a non-empty result always ends with a trailing space and
    runs of whitespace collapse to one space.
    """
    return "".join(
        word + " " for word in animal.split() if not word.isdigit()
    )
def prettify(pedigree):
    """Return the six pedigree values as text, one value per line.

    ``pedigree`` must map 'calving_ease', 'birth_weight', 'wean_weight',
    'year_weight', 'milk' and 'total_mat' to strings. The values are emitted
    in that order, each terminated by a newline.

    Raises KeyError when one of the keys is missing.
    """
    # The earlier version right-aligned each value in a field exactly as wide
    # as the value itself, which leaves the text unchanged; the values are
    # therefore written as-is.
    row_keys = (
        'calving_ease',
        'birth_weight',
        'wean_weight',
        'year_weight',
        'milk',
        'total_mat',
    )
    return ''.join(pedigree[key] + '\n' for key in row_keys)
if __name__ == '__main__':
    # Prompt forever: each URL is scraped, formatted and copied to the
    # clipboard.
    while True:
        url = raw_input('Input a url you want to use to make life easier: \n')
        pageData = getPageData(url)
        if pageData == -1:
            # getPageData returns -1 for URLs it does not accept; passing
            # that on to createPedigree would raise a TypeError.
            print('that url is not supported, expected an abri.une.edu.au address')
            continue
        s = prettify(createPedigree(pageData))
        pyperclip.copy(s)
        if len(s) > 0:
            print('the easy string has been copied to your clipboard')
I've just been using this code for easy copying and pasting. All I have to do is insert the URL, and it saves the numbers to my clipboard.
Now I want to use this code on my website; I want to be able to insert a URL in my HTML code, and it displays these numbers on my page in a table.
My questions are as follows:
How do I use the python code on the website?
How do I insert collected data into a table with HTML?
It sounds like you would want to use something like Django. Although the learning curve is a bit steep, it is worth it and it (of course) supports python.

Creating new table while iterating through a queryset in django

This is a newbie question, but despite reading https://docs.djangoproject.com/en/dev/ref/models/instances/#saving-objects , I'm not quite sure how to do this. I have an existing table where I would like to iterate through all its records, and save certain info to a second table. I have the following model:
class myEmails(models.Model):
text = models.CharField(max_length=1200)
In my view I have:
def getMyMessages(request):
    """Return an HTML response listing the text of every stored Message.

    Each message is rendered as "<n> TEXT: <text>" followed by two ``<br>``
    tags, numbered from 1 in queryset order.
    """
    from django.utils.html import escape
    from django_mailbox.models import Message
    from get_new_emails.models import myEmails
    import re

    qs = Message.objects.all()
    lines = []
    for count, message in enumerate(qs, 1):
        # The text comes from received e-mail, so escape it before placing it
        # inside HTML.
        lines.append(
            str(count) + " TEXT: " + escape(message.text) + '<br>' + '<br>'
        )
    return HttpResponse("".join(lines))
How can I modify my view to save "i.text" to the text field of the 'myEmails' table
You can create new objects and save them to the database afterwards using save():
for i in qs:
obj = myEmails(text=i.text)
obj.save()

Reduce RAM usage in Python script

I've written a quick little program to scrape book data off of a UNESCO website which contains information about book translations. The code is doing what I want it to, but by the time it's processed about 20 countries, it's using ~6GB of RAM. Since there are around 200 I need to process, this isn't going to work for me.
I'm not sure where all the RAM usage is coming from, so I'm not sure how to reduce it. I'm assuming that it's the dictionary that's holding all the book information, but I'm not positive. I'm not sure if I should simply make the program run once for each country, rather than processing the lot of them? Or if there's a better way to do it?
This is the first time I've written anything like this, and I'm a pretty novice, self-taught programmer, so please point out any significant flaws in the code, or improvement tips you have that may not directly relate to the question at hand.
This is my code, thanks in advance for any assistance.
from __future__ import print_function
import urllib2, os
from bs4 import BeautifulSoup, SoupStrainer
''' Set list of countries and their code for niceness in explaining what
is actually going on as the program runs. '''
countries = {"AFG":"Afghanistan","ALA":"Aland Islands","DZA":"Algeria"}
'''List of country codes since dictionaries aren't sorted in any
way, this makes processing easier to deal with if it fails at
some point, mid run.'''
country_code_list = ["AFG","ALA","DZA"]
base_url = "http://www.unesco.org/xtrans/bsresult.aspx?lg=0&c="
destination_directory = "/Users/robbie/Test/"
only_restable = SoupStrainer(class_="restable")
class Book(object):
    """One translation record scraped from a single result row.

    Every ``set_*`` method looks up ``<span>`` elements of one CSS class in
    the row and stores the result on the instance. Single-valued fields that
    are missing are stored as " " so that ``export`` always has text to
    write.
    """

    def __init__(self, book, country):
        """Populate every field from ``book``, the parsed HTML of one row.

        ``country`` is accepted for backward compatibility and is not used
        here; pass the country to ``export`` to choose the output file.
        """
        self.set_author(book)
        self.set_quality(book)
        self.set_target_title(book)
        self.set_target_language(book)
        self.set_translator_name(book)
        self.set_published_city(book)
        self.set_publisher(book)
        self.set_published_country(book)
        self.set_year(book)
        self.set_pages(book)
        self.set_edition(book)
        self.set_original_title(book)
        self.set_original_language(book)

    @staticmethod
    def _first_span_text(book, css_class):
        """Return the first child of the first matching span, or " "."""
        spans = book.find_all('span', class_=css_class)
        if spans and spans[0].contents:
            return spans[0].contents[0]
        return " "

    @staticmethod
    def _join_names(last_names, first_names, separator):
        """Pair each last name with the first name at the same position.

        A last name without a matching first name is emitted on its own.
        Consecutive people are concatenated without a delimiter, as before.
        """
        parts = []
        for index, last_name in enumerate(last_names):
            parts.append(last_name.getText())
            if index < len(first_names):
                parts.append(separator + first_names[index].getText())
        return "".join(parts)

    def set_author(self, book):
        """Set ``self.author`` to "Last, First" for the original author(s).

        Stores " " when the row names no author.
        """
        last_names = book.find_all('span', class_="sn_auth_name")
        first_names = book.find_all('span', class_="sn_auth_first_name")
        self.author = self._join_names(last_names, first_names, ', ') or " "

    def set_quality(self, book):
        """Set ``self.quality`` from the 'sn_auth_quality' span, if any."""
        self.quality = self._first_span_text(book, "sn_auth_quality")

    def set_target_title(self, book):
        """Set ``self.target_title`` from the 'sn_target_title' span."""
        self.target_title = self._first_span_text(book, "sn_target_title")

    def set_target_language(self, book):
        """Set ``self.target_language`` from the 'sn_target_lang' span."""
        self.target_language = self._first_span_text(book, "sn_target_lang")

    def set_translator_name(self, book):
        """Set ``self.translators`` to "Last,First" for the translator(s).

        Stores " " when the row names no translator.
        """
        last_names = book.find_all('span', class_="sn_transl_name")
        first_names = book.find_all('span', class_="sn_transl_first_name")
        self.translators = self._join_names(last_names, first_names, ',') or " "

    def set_published_city(self, book):
        """Set ``self.published_city`` from the 'place' span."""
        self.published_city = self._first_span_text(book, "place")

    def set_publisher(self, book):
        """Set ``self.publisher`` from the 'place' span.

        NOTE(review): this is the same selector as set_published_city, so
        both fields always hold the same value. It looks like a copy-paste
        slip; confirm the publisher's CSS class against the page markup.
        """
        self.publisher = self._first_span_text(book, "place")

    def set_published_country(self, book):
        """Set ``self.published_country`` from the 'sn_country' span."""
        self.published_country = self._first_span_text(book, "sn_country")

    def set_year(self, book):
        """Set ``self.year`` from the 'sn_year' span."""
        self.year = self._first_span_text(book, "sn_year")

    def set_pages(self, book):
        """Set ``self.pages`` from the 'sn_pagination' span."""
        self.pages = self._first_span_text(book, "sn_pagination")

    def set_edition(self, book):
        """Set ``self.edition`` from the 'sn_editionstat' span."""
        self.edition = self._first_span_text(book, "sn_editionstat")

    def set_original_title(self, book):
        """Set ``self.original_title`` from the 'sn_orig_title' span."""
        self.original_title = self._first_span_text(book, "sn_orig_title")

    def set_original_language(self, book):
        """Set ``self.original_languages`` to every original language.

        Each language is followed by ', ' (so the text ends with a trailing
        separator); the value is '' when the row lists none.
        """
        original_languages = book.find_all('span', class_="sn_orig_lang")
        self.original_languages = "".join(
            language.getText() + ', ' for language in original_languages
        )

    def export(self, country):
        """Append this record as one ' & '-separated line to the country file.

        The file is ``<destination_directory>/<country>.csv``; it is opened
        in append mode and each field is encoded as UTF-8 before writing.
        """
        # Pass the parts separately so a missing trailing slash on
        # destination_directory cannot produce a wrong path.
        file_name = os.path.join(destination_directory, country + ".csv")
        fields = (
            self.author,
            self.quality,
            self.target_title,
            self.target_language,
            self.translators,
            self.published_city,
            self.publisher,
            self.published_country,
            self.year,
            self.pages,
            self.edition,
            self.original_title,
            self.original_languages,
        )
        # The with-block closes the file; no explicit close() is needed.
        with open(file_name, "a") as by_country_csv:
            print(" & ".join(field.encode('UTF-8') for field in fields),
                  file=by_country_csv)
def get_all_pages(country, base_url):
    """Return the total number of results listed for ``country``.

    Fetches ``base_url + country``, reads the first ``td`` cell with class
    'res1' and returns the integer after the '/' in its text. Returns 0 when
    the page has no such cell.

    Raises IndexError or ValueError when the cell text does not contain a
    '/' followed by an integer.
    """
    base_page = urllib2.urlopen(base_url + country)
    page = BeautifulSoup(base_page, parse_only=only_restable)
    try:
        result_number = page.find_all('td', class_="res1", limit=1)
        if not result_number:
            return 0
        str_result_number = str(result_number[0].getText())
        return int(str_result_number.split('/')[1])
    finally:
        # Release the parse tree and the connection on every path, including
        # the early return above.
        page.decompose()
        base_page.close()
def build_list(country_code_list, countries):
    """Scrape every result page of each country and export its books.

    ``country_code_list`` gives the country codes in processing order and
    ``countries`` maps each code to a display name used for progress output.
    Result pages are requested at offsets 0, 10, 20, ... below the country's
    result total, and every book is written out as soon as it is parsed.

    Returns None.
    """
    for country in country_code_list:
        print("Processing %s now..." % countries[country])
        results_total = get_all_pages(country, base_url)
        # Step straight to each page offset instead of testing every index.
        for offset in range(0, results_total, 10):
            target_page = urllib2.urlopen(
                base_url + country + "&fr=" + str(offset))
            page = BeautifulSoup(target_page, parse_only=only_restable)
            try:
                for book in page.find_all('td', class_="res2"):
                    # Export immediately so no list of Book objects builds up.
                    Book(book, country).export(country)
            finally:
                page.decompose()
                target_page.close()
if __name__ == "__main__":
build_list(country_code_list,countries)
print("Completed.")
I guess I'll just list off some of the problems or possible improvements in no particular order:
Follow PEP 8.
Right now, you've got lots of variables and functions named using camel-case like setAuthor. That's not the conventional style for Python; Python would typically name that set_author (and published_country rather than PublishedCountry, etc.). You can even change the names of some of the things you're calling: for one, BeautifulSoup supports findAll for compatibility, but find_all is recommended.
Besides naming, PEP 8 also specifies a few other things; for example, you'd want to rewrite this:
if len(resultNumber) == 0 : return 0
as this:
if len(result_number) == 0:
return 0
or even taking into account the fact that empty lists are falsy:
if not result_number:
return 0
Pass a SoupStrainer to BeautifulSoup.
The information you're looking for is probably in only part of the document; you don't need to parse the whole thing into a tree. Pass a SoupStrainer as the parse_only argument to BeautifulSoup. This should reduce memory usage by discarding unnecessary parts early.
decompose the soup when you're done with it.
Python primarily uses reference counting, so removing all circular references (as decompose does) should let its primary mechanism for garbage collection, reference counting, free up a lot of memory. Python also has a semi-traditional garbage collector to deal with circular references, but reference counting is much faster.
Don't make Book.__init__ write things to disk.
In most cases, I wouldn't expect just creating an instance of a class to write something to disk. Remove the call to export; let the user call export if they want it to be put on the disk.
Stop holding on to so much data in memory.
You're accumulating all this data into a dictionary just to export it afterwards. The obvious thing to do to reduce memory is to dump it to disk as soon as possible. Your comment indicates that you're putting it in a dictionary to be flexible; but that doesn't mean you have to collect it all in a list: use a generator, yielding items as you scrape them. Then the user can iterate over it just like a list:
for book in scrape_books():
book.export()
…but with the advantage that at most one book will be kept in memory at a time.
Use the functions in os.path rather than munging paths yourself.
Your code right now is rather fragile when it comes to path names. If I accidentally removed the trailing slash from destinationDirectory, something unintended happens. Using os.path.join prevents that from happening and deals with cross-platform differences:
>>> os.path.join("/Users/robbie/Test/", "USA")
'/Users/robbie/Test/USA'
>>> os.path.join("/Users/robbie/Test", "USA") # still works!
'/Users/robbie/Test/USA'
>>> # or say we were on Windows:
>>> os.path.join(r"C:\Documents and Settings\robbie\Test", "USA")
'C:\\Documents and Settings\\robbie\\Test\\USA'
Abbreviate attrs={"class":...} to class_=....
BeautifulSoup 4.1.2 introduces searching with class_, which removes the need for the verbose attrs={"class":...}.
I imagine there are even more things you can change, but that's quite a few to start with.
What do you want the booklist for, in the end? You should export each book at the end of the "for url in range" block (inside it), and do without the allbooks dict. If you really need a list, define exactly what infos you will need, not keeping full Book objects.

What is the right method of escaping string before using it as json object

I have to create JSON string from database values and push it back to database again. My Python code is:
json = "{"
for row in cursor_mysql:
#mainkey = row[0]
#name = row[1]
#value = row[2]
mainkey = """" " \n \ / """ #for testing only
name = """ {} " \r \t """ #for testing only
value = """ ' " \ & """ #for testing only
json += """"%s":{"name":"%s","value":"%s"},""" % (re.escape(mainkey), re.escape(name), re.escape(value))
json = json[:-1]
json += "}"
#print json
query = """UPDATE table SET json = '%s' WHERE id = '%d' RETURNING id""" % (json, rowId)
cursor_postgres.execute(query)
conn_postgres.commit()
insertId = cursor_postgres.fetchone()[0]
This code works great when there are no malicious characters around. However, it doesn't work when sprinkled with non-alphanumeric values, as in the test cases above.
The bad JSON making it to my db is:
{
"""
\ / ": {
"name": " {} "","value":"'" "
},
"""
\ / ": {
"name": " {} "","value":"'" "
}
}
How to sanitize the string, so that when deserialized the json output is same as input?
import json
data = json.dumps(BIG_STRUCTURE_GOES_HERE)
query = """UPDATE table SET json = %s WHERE id = %s RETURNING id"""
cursor_postgres.execute(query, (data, rowId))
conn_postgres.commit()
http://docs.python.org/library/json.html
http://pypi.python.org/pypi/simplejson/
django.utils.simplejson
Simply use the json library:
import json
mainkey = """" " \n \ / """ #for testing only
name = """ {} " \r \t """ #for testing only
value = """ ' " \ & """ #for testing only
d = {mainkey: {"name": name, "value": value}}
jsonValue = json.dumps(d)

Categories