Appending a python dict from a while loop gives unexpected results - python

The max number of records in my input json is 100 however there is a paging-next link that provides the next 100 records. Below is what I have but it returns a dict with only 100 entries- I know there are more- How should I modify this function to get all the records?
def process_comment_json(comment_json):
post_comment_dict = dict()
next_links = list()
if 'comments' in comment_json.keys():
try:
for y in comment_json['comments']['data']:
post_id = comment_json['id']
commentor_name = y['from']['name']
commentor_id = y['from']['id']
created_time = y['created_time']
message = remove_non_ascii(y['message'])
sentiment = return_sentiment_score(message)
post_comment_dict[commentor_id] = {'commentor_name':commentor_name,\
'created_time':created_time, 'message':message,\
'sentiment':sentiment}
except:
print("malformed data, skipping this comment in round1")
if 'next' in comment_json['comments']['paging']:
print('found_next appending')
next_links.append(comment_json['comments']['paging']['next'])
else:
return post_comment_dict
while next_links:
print("processing next_links")
print("current len of post_comment_dict is:", len(post_comment_dict))
for next_link in next_links:
t = requests.get(next_link)
nl_json = t.json()
next_links.pop()
if "data" in list(nl_json.keys()):
for record in nl_json['data']:
try:
for y in comment_json['comments']['data']:
post_id = comment_json['id']
commentor_name = y['from']['name']
commentor_id = y['from']['id']
created_time = y['created_time']
message = remove_non_ascii(y['message'])
sentiment = return_sentiment_score(message)
post_comment_dict[commentor_id] = {'commentor_name':commentor_name,\
'created_time':created_time, 'message':message,\
'sentiment':sentiment}
except:
print("malformed data, skipping this comment from the next_links list")
if 'next' in comment_json['comments']['paging']:
print('found_next appending')
next_links.append(comment_json['comments']['paging']['next'])
else:
return post_comment_dict

Related

Scraped youtube comments amount and real amount are different

Im new to Python and Im trying to code a commentscraper for youtube with the most important informations, which I put in a JSON-file. But the my amount of comments and replys is not the same as on Youtube. I don't know, where my error is. I recognized, that it doesn't write any data in the files, if there are less than 20 comments, but I don't know, where I have to change something...
Example:
https://youtu.be/Re1m9O7q-9U here I get 102, but it should be 107
https://youtu.be/Q9Y5m1fQ7Fk here I get 423, but it should be 486
https://youtu.be/cMhE5BfmFkM here I get 1315, but it should be 2052
Here is the code:
class YT_Comments:
def __init__(self, api_key):
self.api_key = api_key
self.comment_int = 0
def get_video_comments(self, video_id, limit):
url = f"https://youtube.googleapis.com/youtube/v3/commentThreads?part=replies%2C%20snippet&order=relevance&videoId={video_id}&key={self.api_key}"
vid_comments = []
pc, npt = self._get_comments_per_page(url)
if limit is not None and isinstance(limit, int):
url += f"&maxResults={str(limit)}"
while (npt is not None):
nexturl = url + "&pageToken=" + npt
pc, npt = self._get_comments_per_page(nexturl)
vid_comments.append(pc)
print(self.comment_int)
print(len(vid_comments))
return vid_comments
def _get_comments_per_page(self, url):
json_url = requests.get(url)
data = json.loads(json_url.text)
page_comments = []
if "items" not in data:
return page_comments, None
item_data = data["items"]
nextPageToken = data.get("nextPageToken", None)
for item in tqdm.tqdm(item_data):
try:
kind = item["kind"]
if kind == "youtube#comment" or "youtube#commentThread":
comment_text = item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
comment_author = item["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"]
author_id = item["snippet"]["topLevelComment"]["snippet"]["authorChannelId"]["value"]
comment_like_count = item["snippet"]["topLevelComment"]["snippet"]["likeCount"]
comment_date = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
comment = {"comment_text" : comment_text,
"comment_author" : comment_author,
"comment_author_id" : author_id,
"comment_like_count" : comment_like_count,
"comment_date" : comment_date}
replies_l = []
self.comment_int += 1
try:
replies = item["replies"]["comments"]
for reply in replies:
reply_txt = reply["snippet"]["textOriginal"]
reply_author = reply["snippet"]["authorDisplayName"]
reply_author_id = reply["snippet"]["authorChannelId"]["value"]
reply_like_count = reply["snippet"]["likeCount"]
reply_date = reply["snippet"]["publishedAt"]
reply_dict = {"text" : reply_txt,
"author" : reply_author,
"author_id" : reply_author_id,
"likes" : reply_like_count,
"date" : reply_date}
replies_l.append(reply_dict)
self.comment_int +=1
except KeyError:
replies_l.append(None)
comment_dict = {
"comment": comment,
"replies": replies_l,
}
page_comments.append(comment_dict)
except KeyError:
print("No Comments")
return page_comments, nextPageToken

How to get big amount of data as fast as possible

I am trying to return an array of constructed objects that are build on top of objects that I retrieve from some url plus another fields that I get from another url.
I have an array that consists of two arrays that each has about 8000 objects...
I have tried to make each object construction as a thread however it still takes a lot of time...
Any solution? Here is my code:
def get_all_players_full_data(ea_players_json):
all = []
ea_players_json = list(ea_players_json.values())
for i in range(len(ea_players_json)):
for player_obj in ea_players_json[i]:
all.append(player_obj)
for player_obj in range(len(all)):
all_data = []
with concurrent.futures.ThreadPoolExecutor(len(all)) as executor:
for player_data in all:
future = executor.submit(build_full_player_data_obj, player_data)
print(future.result())
all_data.append(future.result())
def build_full_player_data_obj(ea_player_data):
if ea_player_data.get("c") is not None:
player_full_name = ea_player_data.get("c")
else:
player_full_name = ea_player_data.get("f") + " " + ea_player_data.get("l")
player_id = ea_player_data.get("id")
# go to futhead to find all cards of that player
futhead_url_player_data = f'{FUTHEAD_PLAYER}{player_full_name}'
details_of_specific_player = json.loads(requests.get(futhead_url_player_data).content)
cards_from_the_same_id = []
for player_in_json_futhead in details_of_specific_player:
if player_in_json_futhead["player_id"] == player_id:
rating = player_in_json_futhead["rating"]
specific_card_id = player_in_json_futhead["def_id"]
revision = player_in_json_futhead["revision_type"]
name = player_in_json_futhead["full_name"]
nation = player_in_json_futhead["nation_name"]
position = player_in_json_futhead["position"]
club = player_in_json_futhead["club_name"]
cards_from_the_same_id.append(Player(specific_card_id, name, rating, revision, nation,
position, club))
return cards_from_the_same_id

List comes back as empty when retrieveing data from website ; Python

I am trying to parse data from a website by inserting the data into a list, but the list comes back empty.
url =("http://www.releasechimps.org/resources/publication/whos-there-md- anderson")
http = urllib3.PoolManager()
r = http.request('Get',url)
soup = BeautifulSoup(r.data,"html.parser")
#print(r.data)
loop = re.findall(r'<td>(.*?)</td>',str(r.data))
#print(str(loop))
newLoop = str(loop)
#print(newLoop)
for x in range(1229):
if "\\n\\t\\t\\t\\t" in loop[x]:
loop[x] = loop[x].replace("\\n\\t\\t\\t\\t","")
list0_v2.append(str(loop[x]))
print(loop[x])
print(str(list0_v2))
Edit: Didn't really have anything else going on, so I made your data format into a nice list of dictionaries. There's a weird <td height="26"> on monkey 111, so I had to change the regex slightly.
Hope this helps you, I did it cause I care about the monkeys man.
import html
import re
import urllib.request
list0_v2 = []
final_list = []
url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
data = urllib.request.urlopen(url).read()
loop = re.findall(r'<td.*?>(.*?)</td>', str(data))
for item in loop:
if "\\n\\t\\t\\t\\t" or "em>" in item:
item = item.replace("\\n\\t\\t\\t\\t", "").replace("<em>", "")\
.replace("</em>", "")
if " " == item:
continue
list0_v2.append(item)
n = 1
while len(list0_v2) != 0:
form = {"n":0, "name":"", "id":"", "gender":"", "birthdate":"", "notes":""}
try:
if list0_v2[5][-1] == '.':
numb, name, ids, gender, birthdate, notes = list0_v2[0:6]
form["notes"] = notes
del(list0_v2[0:6])
else:
raise Exception('foo')
except:
numb, name, ids, gender, birthdate = list0_v2[0:5]
del(list0_v2[0:5])
form["n"] = int(numb)
form["name"] = html.unescape(name)
form["id"] = ids
form["gender"] = gender
form["birthdate"] = birthdate
final_list.append(form)
n += 1
for li in final_list:
print("{:3} {:10} {:10} {:3} {:10} {}".format(li["n"], li["name"], li["id"],\
li["gender"], li["birthdate"], li["notes"]))

create and assign subcategories in revit using python

I have a question for some of you who are familiar with the Revit API and python:
I’ve been using the spring nodes package in dynamo to create a rather large series of freeform objects each in their own family. The way that the FamilyInstance.ByGeometry works, it takes a list of solids and creates a family instance for each using a template family file. The result is quite good. (spring nodes can be found here: https://github.com/dimven/SpringNodes)
However, the drawback is that that now I have roughly 200 separate instances, so to make changes to each is rather painful. I thought at first it would be possible to use dynamo to create a new subcategory and set the solid inside each family instance to this new subcategory. Unfortunately, I realized this is not possible since dynamo cannot be open in two different Revit environments simultaneously (the project I am working in and each instance of the family). This leads me to look to see if I can do this using python.
I have used python in rhino and can get along pretty well, I am still learning the Revit API however. But basically my idea would be to:
1. select a series of family instances in the Revit project environment
2. loop through each instance
3. save it to a specified location
4. create a new subcategory in each family instances (the subcategory would the same for all the selected family instances)
5. select the solid in each in instance
6. set the solid to this newly created subcategory
7. close the family instance and save
My question for you is does this sound like it is achievable based on your knowledge of the Revit API?
Many thanks for your time and advice.
UPDATE:
I've found a section in the revit api that describes what i'm looking to do: http://help.autodesk.com/view/RVT/2015/ENU/?guid=GUID-FBF9B994-ADCB-4679-B50B-2E9A1E09AA48
I've made a first pass at inserting this into the python code of the dynamo node. The rest of the code works fine except for the new section im adding (see below). Please excuse the variables, I am simply keeping with logic of the original author of the code i am hacking:
(Note: the variables come in are in arrays)
#set subcategory
try:
#create new sucategory
fam_subcat = famdoc.Settings.Categories.NewSubcategory(fam_cat, get_Item(subcat1.Name))
#assign the mataterial(fam_mat.Id) to the subcategory
fam_subcat.Material = famdoc.GetElement(fam_mat.Id)
#assign the subcategory to the element (s2)
s2.Subcategory = fam_subcat
except: pass
Any help or advice with this section of code would be much appreciated.
UPDATE:
See full code below for context of the section in question:
#Copyright(c) 2015, Dimitar Venkov
# #5devene, dimitar.ven#gmail.com
import clr
import System
from System.Collections.Generic import *
pf_path = System.Environment.GetFolderPath(System.Environment.SpecialFolder.ProgramFilesX86)
import sys
sys.path.append("%s\IronPython 2.7\Lib" %pf_path)
import traceback
clr.AddReference('ProtoGeometry')
from Autodesk.DesignScript.Geometry import *
clr.AddReference("RevitServices")
import RevitServices
from RevitServices.Persistence import DocumentManager
from RevitServices.Transactions import TransactionManager
doc = DocumentManager.Instance.CurrentDBDocument
app = DocumentManager.Instance.CurrentUIApplication.Application
clr.AddReference("RevitAPI")
from Autodesk.Revit.DB import *
from Autodesk.Revit.DB.Structure import StructuralType
clr.AddReference("RevitNodes")
import Revit
clr.ImportExtensions(Revit.Elements)
clr.ImportExtensions(Revit.GeometryConversion)
def tolist(obj1):
if hasattr(obj1,"__iter__"): return obj1
else: return [obj1]
def output1(l1):
if len(l1) == 1: return l1[0]
else: return l1
def PadLists(lists):
len1 = max([len(l) for l in lists])
for i in xrange(len(lists)):
if len(lists[i]) == len1:
continue
else:
len2 = len1 - len(lists[i])
for j in xrange(len2):
lists[i].append(lists[i][-1])
return lists
class FamOpt1(IFamilyLoadOptions):
def __init__(self):
pass
def OnFamilyFound(self,familyInUse, overwriteParameterValues):
return True
def OnSharedFamilyFound(self,familyInUse, source, overwriteParameterValues):
return True
geom = tolist(IN[0])
fam_path = IN[1]
names = tolist(IN[2])
category = tolist(IN[3])
material = tolist(IN[4])
isVoid = tolist(IN[5])
subcategory = tolist(IN[6])
isRvt2014 = False
if app.VersionName == "Autodesk Revit 2014": isRvt2014 = True
units = doc.GetUnits().GetFormatOptions(UnitType.UT_Length).DisplayUnits
factor = UnitUtils.ConvertToInternalUnits(1,units)
acceptable_views = ["ThreeD", "FloorPlan", "EngineeringPlan", "CeilingPlan", "Elevation", "Section"]
origin = XYZ(0,0,0)
str_typ = StructuralType.NonStructural
def NewForm_background(s1, name1, cat1, isVoid1, mat1, subcat1):
t1 = TransactionManager.Instance
TransactionManager.ForceCloseTransaction(t1)
famdoc = doc.Application.NewFamilyDocument(fam_path)
message = None
temp_path = System.IO.Path.GetTempPath()
sat_path = "%s%s.sat" % (temp_path, name1)
try:
if factor != 1:
s1 = s1.Scale(factor)
sat1 = Geometry.ExportToSAT(s1, sat_path)
satOpt = SATImportOptions()
satOpt.Placement = ImportPlacement.Origin
satOpt.Unit = ImportUnit.Foot
view_fec = FilteredElementCollector(famdoc).OfClass(View)
view1 = None
for v in view_fec:
if str(v.ViewType) in acceptable_views:
view1 = v
break
t1.EnsureInTransaction(famdoc)
satId = famdoc.Import(sat1, satOpt, view1)
opt1 = Options()
opt1.ComputeReferences = True
el1 = famdoc.GetElement(satId)
geom1 = el1.get_Geometry(opt1)
enum = geom1.GetEnumerator()
enum.MoveNext()
geom2 = enum.Current.GetInstanceGeometry()
enum2 = geom2.GetEnumerator()
enum2.MoveNext()
s1 = enum2.Current
famdoc.Delete(satId)
TransactionManager.ForceCloseTransaction(t1)
System.IO.File.Delete(sat_path)
except:
message = traceback.format_exc()
pass
if message == None:
try:
save_path = "%s%s.rfa" % (temp_path, name1)
SaveAsOpt = SaveAsOptions()
SaveAsOpt.OverwriteExistingFile = True
t1.EnsureInTransaction(famdoc)
#set the category
try:
fam_cat = famdoc.Settings.Categories.get_Item(cat1.Name)
famdoc.OwnerFamily.FamilyCategory = fam_cat
except: pass
s2 = FreeFormElement.Create(famdoc,s1)
if isVoid1:
void_par = s2.get_Parameter("Solid/Void")
void_par.Set(1)
void_par2 = famdoc.OwnerFamily.get_Parameter("Cut with Voids When Loaded")
void_par2.Set(1)
else: #voids do not have a material value
try:
mat_fec = FilteredElementCollector(famdoc).OfClass(Material)
for m in mat_fec:
if m.Name == mat1:
fam_mat = m
break
mat_par = s2.get_Parameter("Material")
mat_par.Set(fam_mat.Id)
except: pass
#set subcategory
try:
#create new sucategory
fam_subcat = document.Settings.Categories.NewSubcategory(document.OwnerFamily.FamilyCategory, get_Item(subcat1.Name))
#assign the mataterial(fam_mat.Id) to the subcategory
fam_subcat.Material = famdoc.GetElement(fam_mat.Id)
#assign the subcategory to the element (s2)
s2.Subcategory = fam_subcat
except: pass
TransactionManager.ForceCloseTransaction(t1)
famdoc.SaveAs(save_path, SaveAsOpt)
family1 = famdoc.LoadFamily(doc, FamOpt1())
famdoc.Close(False)
System.IO.File.Delete(save_path)
symbols = family1.Symbols.GetEnumerator()
symbols.MoveNext()
symbol1 = symbols.Current
t1.EnsureInTransaction(doc)
if not symbol1.IsActive: symbol1.Activate()
inst1 = doc.Create.NewFamilyInstance(origin, symbol1, str_typ)
TransactionManager.ForceCloseTransaction(t1)
return inst1.ToDSType(False), family1.ToDSType(False)
except:
message = traceback.format_exc()
return message
else:
return message
def NewForm_background_R16(s1, name1, cat1, isVoid1, mat1, subcat1):
t1 = TransactionManager.Instance
TransactionManager.ForceCloseTransaction(t1)
famdoc = doc.Application.NewFamilyDocument(fam_path)
message = None
temp_path = System.IO.Path.GetTempPath()
sat_path = "%s%s.sat" % (temp_path, name1)
try:
if factor != 1:
s1 = s1.Scale(factor)
sat1 = Geometry.ExportToSAT(s1, sat_path)
satOpt = SATImportOptions()
satOpt.Placement = ImportPlacement.Origin
satOpt.Unit = ImportUnit.Foot
view_fec = FilteredElementCollector(famdoc).OfClass(View)
view1 = None
for v in view_fec:
if str(v.ViewType) in acceptable_views:
view1 = v
break
t1.EnsureInTransaction(famdoc)
satId = famdoc.Import(sat1, satOpt, view1)
opt1 = Options()
opt1.ComputeReferences = True
el1 = famdoc.GetElement(satId)
geom1 = el1.get_Geometry(opt1)
enum = geom1.GetEnumerator()
enum.MoveNext()
geom2 = enum.Current.GetInstanceGeometry()
enum2 = geom2.GetEnumerator()
enum2.MoveNext()
s1 = enum2.Current
famdoc.Delete(satId)
TransactionManager.ForceCloseTransaction(t1)
System.IO.File.Delete(sat_path)
except:
message = traceback.format_exc()
pass
if message == None:
try:
save_path = "%s%s.rfa" % (temp_path, name1)
SaveAsOpt = SaveAsOptions()
SaveAsOpt.OverwriteExistingFile = True
t1.EnsureInTransaction(famdoc)
#set the category
try:
fam_cat = famdoc.Settings.Categories.get_Item(cat1.Name)
famdoc.OwnerFamily.FamilyCategory = fam_cat
except: pass
s2 = FreeFormElement.Create(famdoc,s1)
if isVoid1:
void_par = s2.LookupParameter("Solid/Void")
void_par.Set(1)
void_par2 = famdoc.OwnerFamily.LookupParameter("Cut with Voids When Loaded")
void_par2.Set(1)
else: #voids do not have a material value
try:
mat_fec = FilteredElementCollector(famdoc).OfClass(Material)
for m in mat_fec:
if m.Name == mat1:
fam_mat = m
break
mat_par = s2.LookupParameter("Material")
mat_par.Set(fam_mat.Id)
except: pass
#apply same subcategory code as before
#set subcategory
try:
#create new sucategory
fam_subcat = famdoc.Settings.Categories.NewSubcategory(fam_cat, get_Item(subcat1.Name))
#assign the mataterial(fam_mat.Id) to the subcategory
fam_subcat.Material = famdoc.GetElement(fam_mat.Id)
#assign the subcategory to the element (s2)
s2.Subcategory = fam_subcat
except: pass
TransactionManager.ForceCloseTransaction(t1)
famdoc.SaveAs(save_path, SaveAsOpt)
family1 = famdoc.LoadFamily(doc, FamOpt1())
famdoc.Close(False)
System.IO.File.Delete(save_path)
symbols = family1.GetFamilySymbolIds().GetEnumerator()
symbols.MoveNext()
symbol1 = doc.GetElement(symbols.Current)
t1.EnsureInTransaction(doc)
if not symbol1.IsActive: symbol1.Activate()
inst1 = doc.Create.NewFamilyInstance(origin, symbol1, str_typ)
TransactionManager.ForceCloseTransaction(t1)
return inst1.ToDSType(False), family1.ToDSType(False)
except:
message = traceback.format_exc()
return message
else:
return message
if len(geom) == len(names) == len(category) == len(isVoid) == len(material) == len(subcategory):
if isRvt2014:
OUT = output1(map(NewForm_background, geom, names, category, isVoid, material, subcategory))
else:
OUT = output1(map(NewForm_background_R16, geom, names, category, isVoid, material, subcategory))
elif len(geom) == len(names):
padded = PadLists((geom, category, isVoid, material, subcategory))
p_category = padded[1]
p_isVoid = padded[2]
p_material = padded[3]
p_subcategory = padded [4]
if isRvt2014:
OUT = output1(map(NewForm_background, geom, names, p_category, p_isVoid, p_material, p_subcategory))
else:
OUT = output1(map(NewForm_background_R16, geom, names, p_category, p_isVoid, p_material, subcategory))
else: OUT = "Make sure that each geometry\nobject has a unique family name."
Update:
Was able to get it working:
try:
#create new sucategory
fam_subcat = famdoc.Settings.Categories.NewSubcategory(famdoc.OwnerFamily.FamilyCategory, subcat1)
#assign the mataterial(fam_mat.Id) to the subcategory
#fam_subcat.Material = famdoc.GetElement(fam_mat.Id)
#assign the subcategory to the element (s2)
s2.Subcategory = fam_subcat
except: pass
As I answered on your initial query per email, what you are aiming for sounds perfectly feasible to me in the Revit API. Congratulations on getting as far as you have. Looking at the link to the Revit API help file and developer guide that you cite above, it seems that the code has to be executed in the family document while defining the family. The context in which you are trying to execute it is not clear. Have you used EditFamily to open the family definition document? What context are you executing in?

Python json dictionary to array

I am having some trouble understanding json dictionary and array. I have a script that is scraping information from a website.
models.txt is just a list of model numbers such as
30373
30374
30375
and json_descriptions.txt is a list of the keys I want
sku
price
listprice
issoldout
The code is:
import urllib
import re
import json
modelslist = open("models.txt").read()
modelslist = modelslist.split("\n")
descriptionlist = open("json_descriptions.txt").read()
descriptionlist = descriptionlist.split("\n")
for model in modelslist:
htmltext = urllib.urlopen("http://dx.com/p/GetProductInfoRealTime?skus="+model)
htmltext = json.load(htmltext)
if htmltext['success'] == True:
def get_data(dict_index, key):
return htmltext[u"data"][dict_index][key]
for description in descriptionlist:
info = description, (get_data(0,description))
print info
else:
print "product does not exist"
If I print out info I get:
sku 30373
price 9.10
listprice 17.62
issoldout False
so that means info[0] is:
sku
price
listprice
issoldout
and info[1] is:
30373
9.10
17.62
False
I would like to know if there is a way that I can have this:
loop 1 = ['sku','30373','price','4.90','listprice','0','issoldout','False']
loop 2 = ['sku','30374','price','10.50','listprice','0','issoldout','False']
info[0] = sku info[1] = 30373 info[2] = price info[3] = 4.90 info[4] = listprice info[5] = 0 info[6] = issoldout info[7] = False and then repeat that with a new list for the next loop through.
I have tried using info = json.dumps(info) but that just gives info[0] = [[[[ and info[1] = """" info[2] = spli and so on
Like this?
info = []
for model in modelslist:
htmltext = urllib.urlopen("http://dx.com/p/GetProductInfoRealTime?skus="+model)
htmltext = json.load(htmltext)
if htmltext['success'] == True:
def get_data(dict_index, key):
return htmltext[u"data"][dict_index][key]
for description in descriptionlist:
info.append(description)
info.append(get_data(0,description))
print info

Categories