I am trying to extract YouTube videos with their details, and when I create a DataFrame from my dictionary I run into an error. Can someone help?
def youtube_search(q, max_results=50, order="relevance", token=None, location=None, location_radius=None):
    search_response = youtube.search().list(
        q=q,
        type="video",
        pageToken=token,
        order=order,
        part="id,snippet",  # Part signifies the different types of data you want
        maxResults=max_results,
        location=location,
        locationRadius=location_radius).execute()

    title = []
    channelId = []
    channelTitle = []
    categoryId = []
    videoId = []
    viewCount = []
    likeCount = []
    dislikeCount = []
    commentCount = []
    category = []
    tags = []
    videos = []

    for search_result in search_response.get("items", []):
        if search_result["id"]["kind"] == "youtube#video":
            title.append(search_result['snippet']['title'])
            videoId.append(search_result['id']['videoId'])
            response = youtube.videos().list(
                part='statistics, snippet',
                id=search_result['id']['videoId']).execute()
            channelId.append(response['items'][0]['snippet']['channelId'])
            channelTitle.append(response['items'][0]['snippet']['channelTitle'])
            categoryId.append(response['items'][0]['snippet']['categoryId'])
            viewCount.append(response['items'][0]['statistics']['viewCount'])
            likeCount.append(response['items'][0]['statistics']['likeCount'])
            dislikeCount.append(response['items'][0]['statistics']['dislikeCount'])

            if 'commentCount' in response['items'][0]['statistics'].keys():
                commentCount.append(response['items'][0]['statistics']['commentCount'])
            else:
                commentCount.append([])

            if 'tags' in response['items'][0]['snippet'].keys():
                tags.append(response['items'][0]['snippet']['tags'])
            else:
                tags.append([])

            # Not every video has likes/dislikes enabled so they won't appear in the JSON response
            try:
                likeCount.append(response['items'][0]['statistics']['likeCount'])
            except KeyError:
                # Good to be aware of channels that turn off their likes
                print("Video titled {0}, on Channel {1} Likes Count is not available".format(
                    response['items'][0]['snippet']['title'],
                    response['items'][0]['snippet']['channelTitle']))
                print(response['items'][0]['statistics'].keys())
                # Appends "Not available" to keep dictionary values aligned
                likeCount.append("Not available")

            try:
                dislikeCount.append(response['items'][0]['statistics']['dislikeCount'])
            except KeyError:
                # Good to be aware of channels that turn off their dislikes
                print("Video titled {0}, on Channel {1} Dislikes Count is not available".format(
                    response['items'][0]['snippet']['title'],
                    response['items'][0]['snippet']['channelTitle']))
                print(response['items'][0]['statistics'].keys())
                dislikeCount.append("Not available")

    # youtube_dict = {'tags':tags,'channelId': channelId,'channelTitle': channelTitle,'categoryId':categoryId,'title':title,'videoId':videoId,'viewCount':viewCount,'likeCount':likeCount,'dislikeCount':dislikeCount,'commentCount':commentCount,'favoriteCount':favoriteCount}
    youtube_dict = {'tags': tags, 'channelTitle': channelTitle,
                    'title': title, 'videoId': videoId, 'viewCount': viewCount,
                    'likeCount': likeCount, 'dislikeCount': dislikeCount, 'commentCount': commentCount}

    return youtube_dict
q = "covid19 vaccine"
test = youtube_search(q, max_results=100,order="relevance", token=None, location=None, location_radius=None)
import pandas as pd
df = pd.DataFrame(data=test)
df.head()
ValueError: arrays must all be same length
I tried
df = pd.DataFrame.from_dict(data=test, orient='index')
but that doesn't work either; I get another error:
TypeError: __init__() got an unexpected keyword argument 'orient'
Any help would be much appreciated.
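As a side note on where the length mismatch comes from: in the function above, likeCount and dislikeCount are appended to twice per video (once unconditionally and once again inside the try blocks), so those lists end up longer than title or videoId. A minimal sketch of the pandas behaviour and one workaround, using made-up data rather than the real YouTube response:

import pandas as pd

data = {'title': ['a', 'b', 'c'], 'likeCount': ['10', '20', '30', '40']}  # unequal lengths
# pd.DataFrame(data)  # raises ValueError: arrays must all be same length
df = pd.DataFrame({k: pd.Series(v) for k, v in data.items()})  # shorter columns are padded with NaN
print(df)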
Below is a list of twitter handles I am using to scrape tweets
myDict = {}
list = ['ShoePalace', 'StreetWearDealz', 'ClothesUndrCost', 'DealsPlus', 'bodega', 'FRSHSneaks',
'more_sneakers', 'BOOSTLINKS', 'endclothing', 'DopeKixDaily', 'RSVPGallery', 'StealSupply',
'SneakerAlertHD', 'JustFreshKicks', 'solefed', 'SneakerMash', 'StealsBySwell', 'KicksDeals',
'FatKidDeals', 'sneakersteal', 'SOLELINKS', 'SneakerShouts', 'KicksUnderCost', 'snkr_twitr',
'KicksFinder']
In the for loop below I am cycling through each twitter handle and grabbing data. After the data is pulled I am attempting to add it to the dictionary (myDict). Currently the code only returns a single value:
{'title': 'Ad: Nike Air Max 97 Golf ‘Grass’ is back in stock at Nikestore!\n\n>>', 'url': 'example.com', 'image': 'image.jpg', 'tweet_url': 'example.com', 'username': 'KicksFinder', 'date': datetime.datetime(2020, 7, 27, 11, 44, 26)}
for i in list:
    for tweet in get_tweets(i, pages=1):
        tweet_url = 'https://www.twitter.com/' + tweet['tweetUrl']
        username = tweet['username']
        date = tweet['time']
        text = tweet['text']
        title = text.split('http')[0]
        title = title.strip()
        title = title.rstrip()
        try:
            entries = tweet['entries']
            image = entries["photos"][0]
            url = entries["urls"][0]
            myDict['title'] = title
            myDict['url'] = url
            myDict['image'] = image
            myDict['tweet_url'] = tweet_url
            myDict['username'] = username
            myDict['date'] = date
        except IndexError:
            title = title
            image = ""
            link = ""
return(myDict)
You're mutating a single dict, not adding to a list.
We can refactor your code into a handful of simpler functions: some that process raw tweets into dicts, and others that yield processed tweet dicts for a given user.
Instead of printing the tweets at the end, you could now append them to a list - or even simpler, just tweets = list(process_tweets_for_users(usernames)) :)
def process_tweet(tweet) -> dict:
    """
    Turn a Twitter-native tweet into a dict.
    """
    tweet_url = "https://www.twitter.com/" + tweet["tweetUrl"]
    username = tweet["username"]
    date = tweet["time"]
    text = tweet["text"]
    title = text.split("http")[0]
    title = title.strip()
    try:
        entries = tweet["entries"]
        image = entries["photos"][0]
        url = entries["urls"][0]
    except Exception:
        image = url = None
    return {
        "title": title,
        "url": url,
        "image": image,
        "tweet_url": tweet_url,
        "username": username,
        "date": date,
    }
def process_user_tweets(username: str):
    """
    Generate processed tweets for a given user.
    """
    for tweet in get_tweets(username, pages=1):
        try:
            yield process_tweet(tweet)
        except Exception as exc:
            # TODO: improve error handling
            print(exc)

def process_tweets_for_users(usernames):
    """
    Generate processed tweets for a number of users.
    """
    for username in usernames:
        yield from process_user_tweets(username)
usernames = [
    "ShoePalace",
    "StreetWearDealz",
    "ClothesUndrCost",
    "DealsPlus",
    "bodega",
    "FRSHSneaks",
    "more_sneakers",
    "BOOSTLINKS",
    "endclothing",
    "DopeKixDaily",
    "RSVPGallery",
    "StealSupply",
    "SneakerAlertHD",
    "JustFreshKicks",
    "solefed",
    "SneakerMash",
    "StealsBySwell",
    "KicksDeals",
    "FatKidDeals",
    "sneakersteal",
    "SOLELINKS",
    "SneakerShouts",
    "KicksUnderCost",
    "snkr_twitr",
    "KicksFinder",
]

for tweet in process_tweets_for_users(usernames):
    print(tweet)
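And, as mentioned above, if you want to collect the results instead of printing them:

tweets = list(process_tweets_for_users(usernames))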
It is expected that you only get a single result (the last tweet processed), because you are overwriting the dict values on every iteration instead of appending them to lists. I would use defaultdict(list) and then append each tweet:
from collections import defaultdict

myDict = defaultdict(list)

for i in list:
    for tweet in get_tweets(i, pages=1):
        tweet_url = 'https://www.twitter.com/' + tweet['tweetUrl']
        username = tweet['username']
        date = tweet['time']
        text = tweet['text']
        title = text.split('http')[0]
        title = title.strip()
        title = title.rstrip()
        try:
            entries = tweet['entries']
            image = entries["photos"][0]
            url = entries["urls"][0]
            myDict['title'].append(title)
            myDict['url'].append(url)
            myDict['image'].append(image)
            myDict['tweet_url'].append(tweet_url)
            myDict['username'].append(username)
            myDict['date'].append(date)
        except IndexError:
            title = title
            image = ""
            link = ""
return(myDict)
Now that you have everything nice and tidy, you can put it into a DataFrame to work with your data:

import pandas as pd
tweets_df = pd.DataFrame(myDict)
I am trying to get a list of all the OU IDs under an organization using boto3. The current structure is like this:
Root
  |
OU1 ---- OU2 ---- OU3
 |        |        |
OU4      OU5      OU6
 |
OU7
 |
OU8
This structure may change in the future: more OUs might get added and some may be deleted, so I would like to make the function dynamic. I was hoping I could provide the root ID and the function would find all the OU IDs under it. But this seems a little complicated, as there is no single boto3 API that lists all the OU IDs under the root. I would really appreciate any guidance/suggestions.
I have taken a look at -
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/organizations.html#Organizations.Client.list_children
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/organizations.html#Organizations.Client.list_parents
but I am not sure how to connect them so that all the OU IDs are found. Below is the code I wrote, but it only fetches the second layer of children, i.e. up to OU4, OU5 and OU6:
org = session.client("organizations")
response = org.list_roots()
for PolicyTypes in response["Roots"]:
    parent_id = PolicyTypes["Id"]

OUlist = []
NextToken = False
while NextToken is not None:
    if not NextToken:
        response_iterator = org.list_organizational_units_for_parent(ParentId=parent_id, MaxResults=20)
    else:
        response_iterator = org.list_organizational_units_for_parent(ParentId=parent_id, MaxResults=20,
                                                                     NextToken=NextToken)
    OUlist = get_OUlist(OUlist, response_iterator)
    try:
        NextToken = response_iterator['NextToken']
    except KeyError:
        break

get_child_ou(org, OUlist)
def get_child_ou(org, OUlist):
    for ou in OUlist:
        NextToken = False
        while NextToken is not None:
            if not NextToken:
                response_iterator = org.list_children(ParentId=ou, ChildType='ORGANIZATIONAL_UNIT', MaxResults=20)
            else:
                response_iterator = org.list_children(ParentId=ou, ChildType='ORGANIZATIONAL_UNIT', NextToken=NextToken,
                                                      MaxResults=20)
            try:
                NextToken = response_iterator['NextToken']
            except KeyError:
                break
            for orgid in response_iterator["Children"]:
                OUlist.append(orgid["Id"])
    return OUlist
Simple solution:

import boto3

session = boto3.Session(profile_name='default')
org = session.client('organizations')

def printout(parent_id, indent):
    print(f"{'-' * indent} {parent_id}")
    paginator = org.get_paginator('list_children')
    iterator = paginator.paginate(
        ParentId=parent_id,
        ChildType='ORGANIZATIONAL_UNIT'
    )
    indent += 1
    for page in iterator:
        for ou in page['Children']:
            printout(ou['Id'], indent)

if __name__ == "__main__":
    rootid = org.list_roots()["Roots"][0]["Id"]
    printout(rootid, 0)
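Run against a tree like the one in the question, this prints an indented listing roughly like the following (the IDs here are invented for illustration):

 r-abcd
- ou-abcd-11111111
-- ou-abcd-44444444
--- ou-abcd-77777777
---- ou-abcd-88888888
- ou-abcd-22222222
-- ou-abcd-55555555
- ou-abcd-33333333
-- ou-abcd-66666666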
import boto3

def add_ou(ids):
    for id in ids:
        ou_list.append(id)
        child_ids = get_childs(id)
        while child_ids:
            if len(child_ids) > 1:
                add_ou(child_ids)
                child_ids = []
            else:
                ou_list.append(child_ids[0])
                child_ids = get_childs(child_ids[0])

def get_childs(id):
    childs = org_client.list_children(
        ParentId=id,
        ChildType='ORGANIZATIONAL_UNIT')
    return [child["Id"] for child in childs["Children"]]

if __name__ == "__main__":
    org_client = boto3.client('organizations')
    root_id = org_client.list_roots()["Roots"][0]["Id"]
    childs = get_childs(root_id)
    ou_list = []
    add_ou(childs)
    print(ou_list)
This will loop through all organizational units and print their IDs.
In addition to @Danish's answer:
You can now use the Paginator feature for organizations.list_children (and many other API calls). This removes the need to check for NextToken, saves a few lines of code and improves readability :-)
# Lambda example
import boto3

client = boto3.client('organizations')

def lambda_handler(event, context):
    root_id = client.list_roots()['Roots'][0]['Id']
    ou_id_list = get_ou_ids(root_id)
    print(ou_id_list)

def get_ou_ids(parent_id):
    full_result = []
    paginator = client.get_paginator('list_children')
    iterator = paginator.paginate(
        ParentId=parent_id,
        ChildType='ORGANIZATIONAL_UNIT'
    )
    for page in iterator:
        for ou in page['Children']:
            # 1. Add entry
            # 2. Fetch children recursively
            full_result.append(ou['Id'])
            full_result.extend(get_ou_ids(ou['Id']))
    return full_result
I am currently trying to download a large number of NY Times articles using their API, based on Python 2.7. To do so, I was able to reuse a piece of code I found online:
from nytimesarticle import articleAPI

api = articleAPI('...')

articles = api.search(q = 'Brazil',
                      fq = {'headline':'Brazil', 'source':['Reuters','AP', 'The New York Times']},
                      begin_date = '20090101')
def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses
    the articles into a list of dictionaries
    '''
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10]  # cutting time of day.
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        # locations
        locations = []
        for x in range(0, len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0, len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return(news)
def get_articles(date, query):
    '''
    This function accepts a year in string format (e.g. '1980')
    and a query (e.g. 'Amnesty International') and it will
    return a list of parsed articles (in dictionaries)
    for that year.
    '''
    all_articles = []
    for i in range(0, 100):  # NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
        articles = api.search(q = query,
                              fq = {'headline':'Brazil', 'source':['Reuters','AP', 'The New York Times']},
                              begin_date = date + '0101',
                              end_date = date + '1231',
                              page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

Download_all = []
for i in range(2009, 2010):
    print 'Processing' + str(i) + '...'
    Amnesty_year = get_articles(str(i), 'Brazil')
    Download_all = Download_all + Amnesty_year

import csv
keys = Download_all[0].keys()
with open('brazil-mentions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(Download_all)
Without the last bit (starting with import csv) this seems to be working fine. If I simply print my results (print Download_all) I can see them, though in a very unstructured way. Running the actual code, however, I get the message:
File "C:\Users\xxx.yyy\AppData\Local\Continuum\Anaconda2\lib\csv.py", line 148, in _dict_to_list
+ ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: 'abstract'
Since I am quite a newbie at this, I would highly appreciate your help in guiding me on how to download the news articles into a CSV file in a structured way.
Thanks a lot in advance!
Best regards
Where you have:
keys = Download_all[0].keys()
This takes the column headers for the CSV from the dictionary for the first article. The problem is that the article dictionaries do not all have the same keys, so when you reach the first one that has the extra abstract key, it fails.
It looks like you'll have problems with abstract and snippet which are only added to the dictionary if they exist in the response.
You need to make keys equal to the superset of all possible keys:
keys = Download_all[0].keys() + ['abstract', 'snippet']
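If you would rather not hard-code the extra fields, a small sketch that builds the superset from the data itself (assuming Download_all is the list of article dicts built above):

keys = set()
for article in Download_all:
    keys.update(article.keys())
keys = sorted(keys)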
Or, ensure that every dict has a value for every field:
def parse_articles(articles):
    ...
    if i['abstract'] is not None:
        dic['abstract'] = i['abstract'].encode("utf8")
    else:
        dic['abstract'] = ""
    ...
    if i['snippet'] is not None:
        dic['snippet'] = i['snippet'].encode("utf8")
    else:
        dic['snippet'] = ""
I get a flickrapi.exceptions.FlickrError: Error: 1: Photo not found exception in the code below, at this line:
sizes_element = self.flickr.photos_getSizes(k_id = id)
However, as far as I can tell, the photo ID I am passing is correct.
import flickrapi

class FlickrDownloader():
    def __init__(self, key, secret):
        self.api_key = key
        self.secret = secret
        self.flickr = flickrapi.FlickrAPI(self.api_key, self.secret)

    def getUrls(self):
        self.photos = self.flickr
        self.urlList = []
        # get a list of photos
        photo_element = self.flickr.photos_search(tags = 'flowers', per_page = '10')
        # get an iterator over the elements
        photo_iter = photo_element.iter('photo')
        self.i = 0
        for photo in photo_iter:
            # get the photo id
            id = photo.attrib['id']
            print id
            # get the different sizes of the photo
            sizes_element = self.flickr.photos_getSizes(k_id = id)
            # get an iterator
            sizes_iter = sizes_element.iter('size')
            # iterate over the sizes
            for size in sizes_iter:
                # check if it's the original size
                if size.attrib['label'] == 'Original':
                    self.urlList.append(size.attrib['source'])
        return self.urlList
The flickr.photos.getSizes call doesn't take a k_id parameter. Check the Flickr documentation, it shows that you need to pass the photo ID using the photo_id parameter. You need to get the parameter names right, otherwise Flickr doesn't know what to do with them.
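Applied to the code above, that would mean changing the failing line to something like:

sizes_element = self.flickr.photos_getSizes(photo_id = id)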
I have a list property
tag_list = db.StringListProperty()
This has been working fine so far, but today when I tried to write a list with 18 items I got the Too many indexed properties for entity: error. I think this is a case of "exploding indexes."
This is my query:
query = Main.all()
query.filter("url =", url)
query.filter("owner =", user)
Reading the documentation, my understanding is that this error is triggered when there are 2000+ items in the list. If it is being triggered for 18 items, what am I doing wrong and how can I fix this? Thanks.
Update with more code:
query = Main.all()
query.filter("url =", url)
query.filter("owner =", user)
e = query.get()
if e:
    e.tag_list = user_tag_list
    e.pitch = pitch_original
    e.title = title_ascii
    e.put()
    main_id = e.key().id()
else:
    try:
        new_item = Main(
            url = url,
            tag_list = user_tag_list,
            pitch = pitch_original,
            owner = user,
            #title = unicode(title, "utf-8"),
            title = title_ascii,
            display = True)
        # this is where the error occurs in the logs
        new_item.put()
And this is the list:
user_tag_list = [u'box', u'jquery', u'working', u'enter', u'initially', u'text', u'showing', u'javascript', u'overflow', u'focus', u'stack', u'field', u'impossible', u'input', u'hidden', u'element', u'toggling', u'toggled']
This is because of exploding indexes.
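In App Engine's datastore, a composite index that refers to a multi-valued (list) property more than once, or that combines several multi-valued properties, needs one index entry per combination of values, so the number of entries grows multiplicatively rather than with the raw list length; that is what makes an index "explode" long before the documented per-list limit is reached. As a purely hypothetical illustration (this index definition is an assumption, not taken from the question), an index.yaml entry like the following would require 18 * 18 = 324 entries for an entity with 18 tags:

# hypothetical index.yaml: tag_list referenced twice makes the index explode
indexes:
- kind: Main
  properties:
  - name: tag_list
  - name: tag_list
  - name: owner

Checking index.yaml (or the auto-generated composite indexes) for entries that reference tag_list more than once is usually the place to start.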