So I have been using the YouTube API to scrape a channel. Everything was working fine until three days ago (03/15/2019), when the results stopped being sorted. No matter what I put in the order parameter, the results come back the same. Can anyone tell me why it isn't working? Here's the code snippet:
import re
import os
import json
import MySQLdb
from pytube import YouTube
import urllib
import isodate
import sys
def get_all_video_in_channel(channel_id):
    api_key = '<MY KEY>'
    video_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails&id={}&key={}'
    first_url = 'https://www.googleapis.com/youtube/v3/search?key={}&channelId={}&part=snippet,id&order=date&maxResults=50'.format(api_key, channel_id)  # order by date, but it won't work
    res = []
    url = first_url
    while True:
        inp = urllib.urlopen(url)
        resp = json.load(inp)
        vidIds = []
        for jobject in resp['items']:
            if jobject['id']['kind'] == "youtube#video":
                vidIds.append(jobject['id']['videoId'])
        vidreq = urllib.urlopen(video_url.format(",".join(vidIds), api_key))
        vidres = json.load(vidreq)
        for vidjson in vidres['items']:
            res.append(vidjson)
        if len(res) >= 50:
            break
        try:
            next_page_token = resp['nextPageToken']
            url = first_url + '&pageToken={}'.format(next_page_token)
        except:
            break
    return res
c_id = 'UCycyxZMoPwg9cuRDMyQE7PQ'
episodes = get_all_video_in_channel(c_id)
Edit: I did some more research, and people are saying the API is indeed not working properly because YouTube is busy removing the New Zealand shooting videos, and that it should be working properly again soon.
I recommend looking at this answer: https://stackoverflow.com/a/55220182/8327971. This is a known issue acknowledged by Google: https://issuetracker.google.com/issues/128673552.
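Until it is fixed on Google's side, a possible stopgap (not from the linked answer, just a sketch) is to sort whatever get_all_video_in_channel returns client-side by snippet.publishedAt, newest first:

episodes = get_all_video_in_channel(c_id)
# reorder the collected items yourself while order=date is being ignored
episodes.sort(key=lambda item: item['snippet']['publishedAt'], reverse=True)

Note that this only reorders the items you already fetched; while the search ordering is broken, the 50 items you collect are not guaranteed to be the 50 most recent uploads.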
Related
I currently have a script pulling data from Instagram that looks like the code block posted below. As long as you plug in your Instagram credentials under user_name and password, it should be fully reproducible.
It takes the account listed in player_df, pulls a list of all of that account's followers on Instagram, and then pulls the bio information for each of those followers. But when I run it, I get the following error:
ClientConnectionError: timeout The read operation timed out
You can find the entire error log here; I didn't want to post it in the original question because it would exceed the character limit.
As an attempt to fix this, I added the sleep(300) calls to lessen the stress between API calls, but that doesn't seem to do the trick. What would be the best way to get around this so it doesn't time out while running?
from ftplib import error_proto
from hashlib import new
from multiprocessing.spawn import import_main_path
from time import sleep
from instagram_private_api import Client, ClientCompatPatch
from operator import itemgetter
import pandas as pd
import json
import requests
from collections import Counter
import datetime
import os.path
user_name = "XXXXX"
password = "XXXXX"
api = Client(user_name, password)  # log in and create the client used by the calls below
players = [['hannahkshepherd', '201683404']]
player_df = pd.DataFrame(players, columns=['username', 'userId'])
def pull_followers(username_instagram, userid_instagram):
    followers = []
    combinacao = []
    results = api.user_followers(userid_instagram, rank_token=api.generate_uuid())
    followers.extend(results.get('users', []))
    next_max_id = results.get('next_max_id')
    while next_max_id:
        results = api.user_followers(userid_instagram, rank_token=api.generate_uuid(), max_id=next_max_id)
        followers.extend(results.get('users', []))
        next_max_id = results.get('next_max_id')
    userid = [followers[i]['pk'] for i in range(0, len(followers))]
    full_names = [followers[i]['full_name'] for i in range(0, len(followers))]
    usernames = [followers[i]['username'] for i in range(0, len(followers))]
    profile_pic_url = [followers[i]['profile_pic_url'] for i in range(0, len(followers))]
    followers_text = ['follower' for i in range(0, len(followers))]
    following_username = [str(username_instagram) for i in range(0, len(followers))]
    following_userid = [str(userid_instagram) for i in range(0, len(followers))]
    combinacao.extend([list(i) for i in zip(userid, full_names,
                                            usernames, profile_pic_url, followers_text,
                                            following_username, following_userid)])
    combinacao = sorted(combinacao, key=itemgetter(2), reverse=False)
    return combinacao
all_followers = []
for i in range(len(player_df)):
    all_followers += pull_followers(player_df['username'][i], player_df["userId"][i])
def get_bios(followers):
    bios = []
    for follower in followers:
        follower_id = follower[0]
        bios += [[follower_id, api.user_info(follower_id)['user']['biography']]]
    return bios

#sleep(300)
bios = get_bios(all_followers)
#sleep(300)

def print_bios():
    s = ''
    for row in bios:
        s += '\n' + 'user_id: ' + str(row[0]) + ', bio: ' + str(row[1])
    print(s)
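One direction I have been considering (just a sketch, not part of the script above; fetch_bio and its retry numbers are made up for illustration) is to retry each user_info call with a growing delay instead of relying on a single long sleep:

from time import sleep

def fetch_bio(api, follower_id, attempts=5, delay=10):
    # try the same call a few times, waiting a bit longer after each failure
    for attempt in range(attempts):
        try:
            return api.user_info(follower_id)['user']['biography']
        except Exception:  # in practice, catch the client's connection/timeout error here
            sleep(delay * (attempt + 1))
    return None  # give up on this follower after the last attempt

def get_bios_with_retry(api, followers):
    bios = []
    for follower in followers:
        follower_id = follower[0]
        bios.append([follower_id, fetch_bio(api, follower_id)])
    return bios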
Below is my attempt at a username availability checker that uses proxies. So far it works as intended;
the only problem is that it is slow. I tried to implement threads, but it made no difference, and I'm not sure whether I'm doing it right.
I used the concurrent.futures and threading libraries.
Is there a better way to write this kind of program, or do you have any other suggestions?
Thanks in advance
import requests
import json
import ctypes
import colorama
from colorama import Fore
from datetime import datetime
import os
os.system("cls")
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
colorama.init()
url = "https://link"
def grab_proxies():
    proxylist = []
    prx = open('proxy.txt', 'r')
    prx = prx.readlines()
    for proxy in prx:
        proxy = proxy.rstrip("\n")
        proxylist.append(proxy)
    return proxylist
prlist = grab_proxies()
def grab_usernames():
    userlist = []
    users = open('userlist.txt', 'r')
    users = users.readlines()
    for user in users:
        user = user.rstrip("\n")
        userlist.append(user)
    return userlist
ulist = grab_usernames()
found = 0
pc = 0
uc = 0
for i in range(0, len(prlist)):
    ctypes.windll.kernel32.SetConsoleTitleW(f"[# Checker] | Counter: %s - Found: %s - Current Proxy: %s - Started at: %s" % (i, found, prlist[pc], current_time))
    try:
        req = requests.post(url, headers=headers, data={"requested_username": ulist[uc], "xsrf_token": "F0kpyvjJgeBtsOk5Gl6Jvg"}, proxies={'http': prlist[pc], 'https': prlist[pc]}, timeout=2)
        response = req.json()
        #print(response,req.status_code)
        #print(response)
        #print(type(response))
        if(response['reference']['status_code'] == 'TAKEN'):
            #rd = response['errors']['username'][0]['code']
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Taken{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            #print(ulist[uc]+" Taken")
            uc += 1
        elif(response['reference']['status_code'] == 'OK'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTGREEN_EX}Available{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            #print(ulist[uc]+" Available")
            f = open("found.txt", "a")
            f.write(ulist[uc]+"\n")
            f.close()
            found += 1
            uc += 1
        elif(response['reference']['status_code'] == 'INVALID_BEGIN'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Invalid Username{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            uc += 1
        elif(response['reference']['status_code'] == 'DELETED'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Deleted{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            uc += 1
        else:
            print(response)
    except:
        #print(prlist[pc]+ " Going to next proxy")
        pc += 1
        pass
        #break
x = input("Finished!.. press enter to exit")
You could use https://github.com/encode/requests-async to make your requests in an async way.
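For example, a minimal sketch of what that could look like (untested; it reuses the url and payload from your script and leaves out proxies, headers, and error handling):

import asyncio
import requests_async as requests

async def check_username(url, username):
    # one availability check, mirroring the requests.post call in the original script
    response = await requests.post(url, data={"requested_username": username})
    return username, response.status_code

async def check_all(url, usernames):
    # fire all checks concurrently and wait for every result
    return await asyncio.gather(*(check_username(url, u) for u in usernames))

results = asyncio.run(check_all("https://link", ulist))

The point is that the event loop overlaps the waiting time of all the requests, which is where nearly all of the time in the current script goes.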
I am new to categories in the YouTube API in Python, and I'm running into issues with the language of the videos after I select the region to find videos in.
The problem is that when I enter a region code, the script comments on videos that aren't in English even though they're meant to be.
E.g.: I enter the region code 'US' and the outcome is what I have attached; it comments on videos that are in a different language.
(screenshot of the result attached)
I have tried changing region_code="US" in the script, but then it complains about 'US' not being defined.
Does anyone know how I can get around this problem, or what I'm doing wrong? Thanks
API_KEY = "key"

# This function loads the comments in the Comments.txt file
def load_comments(file):
    comments = []
    f = open(file, 'r', encoding='utf8')
    for comment in f:
        comments.append(comment)
    return comments

def search_video(keyword, region_code, comments):
    # Function from library
    from apiclient.discovery import build
    import datetime
    import time

    def get_category_id(youtube, cat):
        req = youtube.videoCategories().list(part='snippet', regionCode=region_code)
        response = req.execute()
        items_list = response["items"]
        for item in items_list:
            video_category_id = item.get("id")
            if video_category_id is not None:
                return video_category_id

    def search(youtube, video_category_id=None):
        if video_category_id is None:
            video_category_id = get_category_id(youtube, keyword)
        req = youtube.search().list(videoCategoryId=video_category_id, order='date', maxResults=len(comments),  # q=keyword,
                                    publishedAfter=(datetime.datetime.utcnow() +
                                                    datetime.timedelta(minutes=-1)).isoformat('T') + 'Z',
                                    part='snippet', type='video')
        return req, video_category_id

    api_key = API_KEY
    youtube = build('youtube', 'v3', developerKey=api_key)
    req, video_category_id = search(youtube)
    res = req.execute()
    while len(res['items']) == 0:
        time.sleep(10)
        req, _ = search(youtube, video_category_id)  # re-use category id if already found to prevent lag
        res = req.execute()
    videoid = [res['items'][i]['id']['videoId'] for i in range(len(res['items']))]
    return videoid
(It asks what region to use btw)
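One thing I have been considering but haven't tried yet is asking the API itself to prefer English results. As far as I can tell, search().list() accepts a relevanceLanguage parameter alongside regionCode, so the request built inside search() might become something like this (just a sketch of the changed call, not working code from my script):

req = youtube.search().list(videoCategoryId=video_category_id,
                            order='date',
                            maxResults=len(comments),
                            regionCode=region_code,        # e.g. 'US'
                            relevanceLanguage='en',        # prefer English-language results
                            publishedAfter=(datetime.datetime.utcnow() +
                                            datetime.timedelta(minutes=-1)).isoformat('T') + 'Z',
                            part='snippet', type='video')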
I'm writing a program that downloads images from the internet, and I would like to speed it up by making multiple requests at once.
I wrote some code, which you can see here on GitHub.
At the moment I can only request a webpage like this:
from urllib.request import Request, urlopen  # needed for Request and urlopen below

def myrequest(url):
    worked = False
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    while not worked:
        try:
            webpage_read = urlopen(req).read()
            worked = True
        except:
            print("failed to connect to \n{}".format(url))
    return webpage_read

url = "http://www.mangahere.co/manga/mysterious_girlfriend_x"
webpage_read = myrequest(url).decode("utf-8")
The while loop is there because I definitely want to download every single picture, so I keep trying until it works (nothing should go wrong except urllib.error.HTTPError: HTTP Error 504: Gateway Time-out).
My question is: how do I run this multiple times at once?
My idea is to have "a commander" which runs 5 (or 85) Python scripts, gives each a URL, and collects the webpage from each of them once it finishes, but this is definitely a silly solution :)
EDIT:
I used _thread, but it doesn't seem to speed up the program. That should have been the solution, so am I doing it wrong? That is my new question.
You can use the link to get to my code on GitHub:
def thrue_thread_download_pics(path, url, ep, name):
    lock.acquire()
    global goal
    goal += 1
    lock.release()
    webpage_read = myrequest("{}/{}.html".format(url, ep))
    url_to_pic = webpage_read.decode("utf-8").split('" onerror="')[0].split('<img src="')[-1]
    pic = myrequest(url_to_pic)
    myfile = open("{}/pics/{}.jpg".format(path, name), "wb")
    myfile.write(pic)
    myfile.close()
    global finished
    finished += 1
and I'm using it here:
for url_ep in urls_eps:
    url, maxep = url_ep.split()
    maxep = int(maxep)
    chap = url.split("/")[-1][2:]
    if "." in chap:
        chap = chap.replace(".", "")
    else:
        chap = "{}0".format(chap)
    for ep in range(1, maxep + 1):
        ted = time.time()
        name = "{}{}".format(chap, "{}{}".format((2 - len(str(ep))) * "0", ep))
        if name in downloaded:
            continue
        _thread.start_new_thread(thrue_thread_download_pics, (path, url, ep, name))

checker = -1
while finished != goal:
    if finished != checker:
        checker = finished
        print("{} of {} downloaded".format(finished, goal))
    time.sleep(0.1)
Requests-Futures is built on top of the very popular requests library and runs your requests in the background so they don't block:
from requests_futures.sessions import FuturesSession
session = FuturesSession()
# These requests will run at the same time
future_one = session.get('http://httpbin.org/get')
future_two = session.get('http://httpbin.org/get?foo=bar')
# Get the first result
response_one = future_one.result()
print(response_one.status_code)
print(response_one.text)
# Get the second result
response_two = future_two.result()
print(response_two.status_code)
print(response_two.text)
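Applied to your case, it could look roughly like this (a sketch only; episode_urls stands in for the list of "{}/{}.html" page URLs you build in your loop):

from concurrent.futures import as_completed
from requests_futures.sessions import FuturesSession

session = FuturesSession(max_workers=8)  # number of parallel downloads, tune as needed

# submit one request per episode page up front
futures = {session.get(u, headers={'User-Agent': 'Mozilla/5.0'}): u for u in episode_urls}

# collect the pages as they finish, in whatever order they complete
for future in as_completed(futures):
    page_url = futures[future]
    try:
        webpage_read = future.result().content  # the same bytes urlopen(req).read() gave you
    except Exception:
        print("failed to connect to \n{}".format(page_url))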
As of right now I have the majority of the code done for browsing a subreddit and downloading the top images at the time of the request. I was able to do this using PRAW and urllib to download the images once I get their links. The final part that I am stuck on is putting the image files in an array and actually setting them as my background. Here is what I have:
import praw
import time
import os
import urllib as ul
def backGroundChanger(sub):
    USER_AGENT='wall paper changer for linux/windows by /u/**********' #specifies what my bot does and by who
    REDDIT_ID= #reddit id
    REDDIT_PASS= #reddit password
    reddit=praw.Reddit(USER_AGENT) #creates bot
    reddit.login(REDDIT_ID,REDDIT_PASS) #logsin
    print reddit.is_logged_in()
    images=reddit.get_subreddit(sub)
    while True:
        count=0
        for sub in images.get_hot(limit=10):
            imageLink=sub.url
            print imageLink
            n=str(count)
            ul.urlretrieve(imageLink, "i" + n)
            count+=1
        file=[]
        dir=os.getcwd()
        for files in os.listdir("."):
            if files.endswith(".jpg|| .png"): # not sure if this will work
                file.append(files)
        changeBackGround(file, dir)

def changeBackGround(file, dir):
    #Do back ground changing stuff here
    pass

def main():
    subreddit=input("What subreddit would you like me to pull images from? ")
    print "You chose " + subreddit
    backGroundChanger(subreddit)

main()
This might work, maybe not; it's untested.
Read up on the os.system function for a way to call system programs that set the background, like xsetbg on Linux. Look here for setting the Windows background (it only involves hacking the registry).
import os
import glob
import random
import sys
import time
import urllib
import praw
def backGroundChanger(sub):
    USER_AGENT = 'wall paper changer for linux/windows by /u/**********' #specifies what my bot does and by who
    REDDIT_ID = #reddit id
    REDDIT_PASS = #reddit password
    reddit = praw.Reddit(USER_AGENT) #creates bot
    reddit.login(REDDIT_ID, REDDIT_PASS) #logsin
    print reddit.is_logged_in()
    images = reddit.get_subreddit(sub)
    while True:
        count = 0
        for sub in images.get_hot(limit = 10):
            imageLink = sub.url
            print imageLink
            n = str(count)
            urllib.urlretrieve(imageLink, "i" + n)
            count += 1
        files = glob.glob("*.jpg") + glob.glob("*.png")
        changeBackGround(files)

def changeBackGround(ifiles):
    #Do back ground changing stuff here
    the_file = ifiles[random.randint(0, len(ifiles) - 1)]
    if(sys.platform.startswith("win")): # Windows
        # Do this yourself
        pass
    elif(sys.platform.startswith("linux")): # Linux
        os.system("xsetbg -center %s" % the_file)

def main():
    subreddit = input("What subreddit would you like me to pull images from? ")
    print "You chose " + subreddit
    backGroundChanger(subreddit)

main()
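If you would rather not touch the registry for the Windows branch, one common alternative is the SystemParametersInfoW call through ctypes (again untested, just a sketch):

import ctypes
import os

SPI_SETDESKWALLPAPER = 20  # system parameter id for changing the wallpaper

def set_windows_wallpaper(image_path):
    # the path must be absolute; the trailing 3 asks Windows to update and persist the setting
    abs_path = os.path.abspath(image_path)
    ctypes.windll.user32.SystemParametersInfoW(SPI_SETDESKWALLPAPER, 0, abs_path, 3)

On Python 2 you would pass a unicode path (or use SystemParametersInfoA with a byte string) so the wide-character call gets the type it expects.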