I have a Python scraper whose main purpose is to:
read a list of postcodes from a text file into an array
for each postcode in the array, search 10 pages
pull out certain content.
I seem to be getting results like:
page 1
page 2
page 2
page 3
page 3
page 3
page 4
page 4
page 4
page 4
etc
I have tried rearranging the code several times without much luck; everything works fine except this step.
from bs4 import BeautifulSoup
import time
from time import sleep
from datetime import datetime
import requests
import csv
print(" Initializing ...")
print(" Loading Keywords")
with open("pcodes.txt") as pcodes:
postkeys = []
for line in pcodes:
postkeys.append(line.strip())
with open("pcodnum.txt") as pcodnum:
postkeynum = []
for line in pcodnum:
postkeynum.append(line.strip())
print(" Welcome to YellScrape v1.0")
print(" You ar searching yell.com ")
comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
comloc = " "
f = csv.writer(open(datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w'))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
data_list = []
for x in postkeys:
print(" Searching " + x + " for " + comtype + " companies")
for y in postkeynum:
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
data_list.append(url)
for item in data_list:
site = requests.get(item, headers=headers)
soup = BeautifulSoup(site.content, 'html.parser')
questions = soup.select('.businessCapsule--mainContent')
for question in questions:
listinnum += 1
busname = question.find(class_='businessCapsule--name').get_text()
bustype = question.find(class_='businessCapsule--classification').get_text()
busnumber = question.select_one('span.business--telephoneNumber')
if busnumber is None:
busnumber = 'None'
else:
busnumber = busnumber.text
busadd = question.find('span', attrs={"itemprop": "streetAddress"})
if busadd is None:
busadd = 'None'
else:
busadd = busadd.text.replace(',',' ')
buslocal = question.find('span', attrs={"itemprop": "addressLocality"})
if buslocal is None:
buslocal = 'None'
else:
buslocal = buslocal.text
buspost = question.find('span', attrs={"itemprop": "postalCode"})
if buspost is None:
buspost = 'None'
else:
buspost = buspost.text
busweb = question.find('a', attrs={"rel": "nofollow noopener"})
if busweb is None:
busweb = 'None'
else:
busweb = busweb.attrs['href']
print(busweb)
f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])
pagesnum += 1
print(" Finsihed Page " + str(y) + ". For " + x + " . " + str(listinnum) + " listings so far. Moving To Next Page")
print(" Waiting 30 seconds for security reasons.")
sleep(30)
print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')
Expected result:
finished page 1
finished page 2
finished page 3
etc
It's because you are appending to your data list and then using a for loop to iterate through it after each time it appends a new link.
So it's going to do requests for page 1, then requests for pages 1 and 2, then pages 1, 2 and 3, then pages 1, 2, 3 and 4... etc.
There are 2 ways to fix that: 1) don't append to data_list and eliminate it altogether, or 2) append to data_list FIRST, and then loop through it (i.e. separate the loop that appends to data_list from the loop that iterates through data_list).
I chose option 2).
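In skeleton form the change looks like this (a sketch only; build_url and scrape_page are placeholders for the string concatenation and the parsing block in the full script below):

data_list = []
for x in postkeys:                 # first build the complete list of URLs
    for y in postkeynum:
        data_list.append(build_url(comtype, y, x))   # placeholder for the URL string concatenation

for item in data_list:             # only then loop over the finished list once
    scrape_page(item)              # placeholder for the requests/BeautifulSoup/CSV block

Here is the full script with that change applied: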
from bs4 import BeautifulSoup
import time
from time import sleep
from datetime import datetime
import requests
import csv
print(" Initializing ...")
print(" Loading Keywords")
with open("C:/pcodes.txt") as pcodes:
postkeys = []
for line in pcodes:
postkeys.append(line.strip())
with open("C:/pcodnum.txt") as pcodnum:
postkeynum = []
for line in pcodnum:
postkeynum.append(line.strip())
print(" Welcome to YellScrape v1.0")
print(" You are searching yell.com ")
comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
comloc = " "
f = csv.writer(open('C:/'+datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w'))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
data_list = []
for x in postkeys:
print(" Searching " + x + " for " + comtype + " companies")
for y in postkeynum:
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
data_list.append(url)
# Now that you created a list of the urls, now you can loop through them
for item in data_list:
page = item.split('pageNum=')[-1].split('&')[0]
location = item[-5:]
site = requests.get(item, headers=headers)
soup = BeautifulSoup(site.content, 'html.parser')
questions = soup.select('.businessCapsule--mainContent')
for question in questions:
listinnum += 1
busname = question.find(class_='businessCapsule--name').get_text()
bustype = question.find(class_='businessCapsule--classification').get_text()
busnumber = question.select_one('span.business--telephoneNumber')
if busnumber is None:
busnumber = 'None'
else:
busnumber = busnumber.text
busadd = question.find('span', attrs={"itemprop": "streetAddress"})
if busadd is None:
busadd = 'None'
else:
busadd = busadd.text.replace(',',' ')
buslocal = question.find('span', attrs={"itemprop": "addressLocality"})
if buslocal is None:
buslocal = 'None'
else:
buslocal = buslocal.text
buspost = question.find('span', attrs={"itemprop": "postalCode"})
if buspost is None:
buspost = 'None'
else:
buspost = buspost.text
busweb = question.find('a', attrs={"rel": "nofollow noopener"})
if busweb is None:
busweb = 'None'
else:
busweb = busweb.attrs['href']
print(busweb)
f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])
pagesnum += 1
print(" Finished Page " + page + ". For " + location + " . " + str(listinnum) + " listings so far. Moving To Next Page")
if item != data_list[-1]:
print(" Waiting 30 seconds for security reasons.")
sleep(30)
print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')
Initialize pageNum inside for loop:
for x in postkeys:
pageNum = 1
Increment pageNum inside the for loop and format the URL:
for item in data_list:
#format website url
url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords={}&pageNum={}&location={}".format(comtype, pageNum, x)
site = requests.get(url, headers=headers)
# check response status code:
if site.status_code != 200:
break
pageNum += 1
You should remove this for loop:
for y in postkeynum:
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
data_list.append(url)
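Putting those fragments together, the paging part could look like this (a sketch of one way to read that advice, keeping the question's variable names; the BeautifulSoup parsing and CSV writing are unchanged and omitted here):

for x in postkeys:
    pageNum = 1
    while True:
        # format the website url for the current postcode and page
        url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords={}&pageNum={}&location={}".format(comtype, pageNum, x)
        site = requests.get(url, headers=headers)
        # stop paging this postcode once the site stops returning results
        if site.status_code != 200:
            break
        # ... parse site.content and write the CSV rows here ...
        pageNum += 1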
When I try to crawl thesis information in multiple processes, I cannot close the processes after getting the information (see the "error" screenshot).
When I comment out the code whose job is to get the information from the network, these processes can end normally (see the "normal" screenshot).
This error is troubling me and I don't have any idea why it happens; the network connection is made with requests and I do call response.close().
Can anyone help this confused person? Thanks.
This is the whole code (my Python is 3.7):
from multiprocessing import Process, Queue, Pool,Manager,Value
import time, random
import requests
import re
from bs4 import BeautifulSoup
headers = {
'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
'Connection': 'close'
}
## Just get the html text
def GetUrlInfo(url):
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
response.close()
SoupData = BeautifulSoup(response.text, 'lxml')
return SoupData
def GetVolumeUrlfromUrl(url:str)->str:
"""input is Journal's url and output is a link and a text description to each issue of the journal"""
url = re.sub('http:', 'https:', url)
SoupDataTemp = GetUrlInfo(url+'index.html')
SoupData = SoupDataTemp.find_all('li')
UrlALL = []
for i in SoupData:
if i.find('a') != None:
            volumeUrlRule = '<a href="(.*?)">(.*?)</a>'  # capture (href, link text) pairs from each anchor
            volumeUrlTemp = re.findall(volumeUrlRule, str(i), re.I)
# u = i.find('a')['href']
# # print(u)
for u in volumeUrlTemp:
if re.findall(url, u[0]):
# print(u)
UrlALL.append((u[0], u[1]), )
# print(UrlALL)
return UrlALL
def GetPaperBaseInfoFromUrlAll(url:str)->str:
"""The input is the url and the output is all the paper information obtained from the web page,
including, doi, title, author, and the date about this volume """
soup = GetUrlInfo(url)
temp1 = soup.find_all('li',class_='entry article')
temp2= soup.find_all('h2')
temp2=re.sub('\\n',' ',temp2[1].text)
# print(temp2)
volumeYear = re.split(' ',temp2)[-1]
paper = []
for i in temp1:
if i.find('div',class_='head').find('a')== None:
paperDoi = ''
else:
paperDoi = i.find('div',class_='head').find('a')['href']
title = i.find('cite').find('span',class_='title').text[:-2]
paper.append([paperDoi,title])
return paper,volumeYear
# test start
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)
# put the url into the query
def Write(query,value,num):
for count in range(num):
query.put(value[count][0],True)
# time.sleep(random.random())
print('write end')
# from the query get the url and get the paper info with this url
def Read(query,num,PaperInfo1,COUNT,i,paperNumber):
while True:
count = COUNT.get(True)
# print("before enter" + str(i) + ' - ' + str(count)+' - '+str(num))
COUNT.put(count, True)
if not query.empty():
value = query.get(True)
count = COUNT.get(True)
count = count + 1
COUNT.put(count,True)
paper, thisYear = GetPaperBaseInfoFromUrlAll(value) # just commented
print("connected " + str(i) + ' - ' + str(count) + ' - ' + str(num))
numb = paperNumber.get(True)
numb = numb + len(paper)
paperNumber.put(numb) # just commented
# print(paper,thisYear)
PaperInfo1.put((paper,thisYear),) # just commented
print("the process "+str(i)+' - '+ str(count)+ ' : '+value)
if not COUNT.empty():
count = COUNT.get(True)
# print("after enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
COUNT.put(count,True)
if int(count) == int(num):
print("the process "+str(i)+" end ")
break
print('read end')
# print the paper info
def GetPaperInfo(PaperInfo1,paperNumber):
for i in range(paperNumber.get(True)):
value = PaperInfo1.get(True)
print(value)
if __name__=='__main__':
    r_num = 10 # the read process count
    w_num = 1 # the write process count
    w_cnt = UrlLen # the write counter
    q = Queue(UrlLen) # the volume url queue
    paperNumber = Queue(1) # the total paper number
    COUNT = Queue(1) # the end tag
COUNT.put(int(0)) # first is zero
paperNumber.put(int(0)) # first is zero
PaperInfo1 = Queue()
r_list = [Process( target=Read, args=(q,w_cnt,PaperInfo1,COUNT,i,paperNumber) ) for i in range(r_num)]
w_list = [Process( target=Write, args=(q,UrlALL,w_cnt) )]
time_start = time.time()
[task.start() for task in w_list]
[task.start() for task in r_list]
[task.join() for task in w_list]
[task.join() for task in r_list]
time_used = time.time() - time_start
GetPaperInfo(PaperInfo1, paperNumber)
print('time_used:{}s'.format(time_used))
I have no idea. When debugging, the process finally enters process.py -> line 297: try: self.run(), then line 300: util._exit_function(), and prints just one "connected" (see the debug screenshot).
I don't know why the network request can cause this or how to solve it.
That's all, thank you!
Hi, this is me again. I tried a concurrent implementation with threads, and global variables shared between threads are much more comfortable than sharing data through process queues. With threads the crawling does work, but now my main function cannot be stopped: previously, with processes, the program could not proceed to the next step while fetching concurrently; now the data fetching is implemented with threads and execution does continue in the main function, but the main function can no longer be stopped. How interesting!
I have designed three functions similar to the previous ones.
GetUrlintoQueue writes the fetched URLs in UrlALL to the queue UrlQueue; UrlLen is the number of URLs.
import threading
import queue
count = 0 # Record the number of times a value is fetched from the queue
paperNumber = 0 # Record the number of papers
def GetUrlintoQueue(UrlQueue,UrlALL,UrlLen):
for index in range(UrlLen):
UrlQueue.put(UrlALL[index][0], True)
print('Write End')
UrlQueue.task_done()
The other is GetPaperInfofromUrl: it gets a URL from UrlQueue and writes the information of the corresponding page to PaperInfo; index is the thread number.
def GetPaperInfofromUrl(UrlQueue,PaperInfo,index,UrlLen):
global count,paperNumber
while True:
if not UrlQueue.empty():
url = UrlQueue.get(True)
count = count + 1
paper, thisYear = GetPaperBaseInfoFromUrlAll(url) # just commented
print("connected " + str(index) + '-nd - ' + str(count) + ' - ' + str(UrlLen))
print(paper,thisYear)
paperNumber = paperNumber + len(paper)
PaperInfo.put((paper, thisYear), True)
if count == UrlLen:
print("the process " + str(index) + " end ")
break
UrlQueue.task_done()
PaperInfo.task_done()
print('the process ' + str(index) +' get paper info end')
GetPaperInfo shows the results stored in PaperInfo, and it doesn't change.
def GetPaperInfo(PaperInfo,paperNumber):
for i in range(paperNumber):
value = PaperInfo.get(True)
print(value)
The main function first sets the corresponding variables, then does the writing, then 10 threads crawl the paper information, and finally it shows the results. But after displaying the results it still cannot exit, and I cannot understand why.
if __name__ == '__main__':
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)
UrlQueue = queue.Queue(UrlLen)
PaperInfo = queue.Queue(1000)
WriteThread = 1
ReadThread = 10
# url write
GetUrlThread = [threading.Thread(target=GetUrlintoQueue, args=(UrlQueue,UrlALL,UrlLen,))]
time_start = time.time()
[geturl.start() for geturl in GetUrlThread]
[geturl.join() for geturl in GetUrlThread]
time_used = time.time() - time_start
print('time_used:{}s'.format(time_used))
# url write end
# paperinfo get
PaperinfoGetThread = [threading.Thread(target=GetPaperInfofromUrl, args=(UrlQueue,PaperInfo,index,UrlLen,)) for index in range(ReadThread)]
time_start = time.time()
[getpaper.start() for getpaper in PaperinfoGetThread]
[getpaper.join() for getpaper in PaperinfoGetThread]
time_used = time.time() - time_start
print('time_used:{}s'.format(time_used))
# paperinfo get end
GetPaperInfo(PaperInfo,paperNumber) # show the results
import sys # it does not work
sys.exit()
The debugger output is in debug.gif (I don't have 10 reputation, so the picture is only a link).
Here is how your process might look using concurrent.futures to manage all the threads and the data transport (not tested), adapting an example from the documentation.
from concurrent.futures import ThreadPoolExecutor, as_completed

def GetPaperInfofromUrl(index, url):
    paper, thisYear = GetPaperBaseInfoFromUrlAll(url)
    return (index, url, paper, thisYear)

if __name__ == "__main__":
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    urls, descr = zip(*GetVolumeUrlfromUrl(url))
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        # submit one task per volume url; at most 10 run at a time
        futs = [executor.submit(GetPaperInfofromUrl, index, url) for index, url in enumerate(urls)]
        for future in as_completed(futs):
            results.append(future.result())
GetPaperInfofromUrl seems superfluous; you could probably refactor GetPaperBaseInfoFromUrlAll and avoid a function call.
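For example, once the executor block finishes, the original counters can be rebuilt from results (a small sketch; the tuple layout matches the return value of GetPaperInfofromUrl above):

paperNumber = sum(len(paper) for _, _, paper, _ in results)
print("collected {} volumes and {} papers".format(len(results), paperNumber))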
I should preface this by saying that I am not a programmer and most of this code was not written by me. Unfortunately I have a need and am trying to hack my way through this.
What I am trying to do is chain a few API calls together to ultimately get a list of IPs. What this script does is queries the API and pulls (and prints) a list of device IDs. The device IDs look like this:
akdjlfijoaidjfod
g9jkidfjlskdjf44
3jdhfj4hf9dfiiu4
The device IDs then need to be passed as a parameter in the next API call like this:
https://api.example.com/devices/entities/devices/v1?ids=akdjlfijoaidjfod&ids=g9jkidfjlskdjf44&ids=3jdhfj4hf9dfiiu4 and so on.
I don't know where to begin. Instead of printing the asset IDs, I assume they should be stored in a variable and then appended to the URL. I tried doing that with "ID_LIST" but that didn't seem to work. Can you point me in the right direction?
import requests
import json
# Define API REST paths
BASE_URL = "https://api.example.com/"
OAUTH_URL_PART = "oauth2/token"
DEVICE_SEARCH = "devices/queries/devices/v1"
DEVICE_DETAILS = "devices/entities/devices/v1"
# Empty auth token to hold value for subsequent request
auth_Token = ""
# Section 1 - Authenticate to Example OAUTH
# Build a dictionary to hold the headers
headers = {
'Content-type': 'application/x-www-form-urlencoded',
'accept': 'application/json'
}
# Build a dictionary to holds the authentication data to be posted to get a token
auth_creds = {}
auth_creds['client_id'] = "<client_id>"
auth_creds['client_secret'] = "<client_secret>"
auth_creds['grant_type'] = "client_credentials"
# Call the API to get a Authentication token - NOTE the authentication creds
print("Requesting token from " + BASE_URL + OAUTH_URL_PART)
auth_response = requests.post(BASE_URL + OAUTH_URL_PART,data=auth_creds, headers=headers)
# Check if successful
if auth_response.status_code != 201:
# Output debug information
print("\n Return Code: " + str(auth_response.status_code) + " " + auth_response.reason)
print("Path: " + auth_response.request.path_url)
print("Headers: ")
print(auth_response.request.headers)
print("Body: " + auth_response.request.body)
print("\n")
print("Trace_ID: " + auth_response.json()['meta']['trace_id'])
else:
# Section 2 - Capture OAUTH token and store in headers for later use
print("Token Created")
# Capture the auth token for reuse in subsequent calls, by pulling it from the response
# Note this token can be reused multiple times until it expires after 30 mins
auth_Token = auth_response.json()['access_token']
headers = {
'authorization':'bearer ' + auth_Token,
'accept': 'application/json'
}
# Section 3 - Reuse authentication token to call other Example OAUTH2 APIs
# Build parameter dictionary
call_params = {}
call_params['offset'] ="" # Non-mandatory param
call_params['limit'] ="5000" # The number of results
call_params['sort'] ="" #
call_params['filter'] ="" # To exclude devices
# Call devices API
print("Searching Asset ID by getting from " + BASE_URL + DEVICE_SEARCH)
DEVICE_search_response = requests.get(BASE_URL + DEVICE_SEARCH,params=call_params,headers=headers)
#DEVICE_DETAILS_response = request.get(BASE_URL + DEVICE_DETAILS,headers=headers)
# Check for errors
if DEVICE_search_response.status_code != 200:
# Output debug information
print("\n Return Code: " + str(DEVICE_search_response.status_code) + " " + DEVICE_search_response.reason)
print("Path: " + DEVICE_search_response.request.path_url)
print("Headers: ")
print(DEVICE_search_response.request.headers)
print("Body: " + DEVICE_search_response.request.body)
print("\n")
print("Trace_ID: " + DEVICE_search_response.json()['meta']['trace_id'])
else:
# Iterate the results and print
result = DEVICE_search_response.json()
print("DEVICE found on " + str(len(result['resources'])) + " the following device id:")
for devices in result['resources']:
print(devices)
###########Part that is not working###########
DEVICE_DETAILS_response = requests.get(BASE_URL + DEVICE_DETAILS,headers=headers)
#ID_LIST = str(len(result['resources']).replace(",", "&ids=")
if DEVICE_DETAILS_response.status_code != 200:
# Output debug information
print("\n Return Code: " + str(DEVICE_DETAILS_response.status_code) + " " + DEVICE_DETAILS_response.reason)
print("Path: " + DEVICE_DETAILS_response.request.path_url)
print("Headers: ")
print(DEVICE_DETAILS_response.request.headers)
print("Body: " + DEVICE_DETAILS_response.request.body)
print("\n")
print("Trace_ID: " + DEVICE_DETAILS_response.json()['meta']['trace_id'])
else:
result = DEVICE_DETAILS_response.json()
print("Device Details Found")
for details in result['resources']:
print(details)
Hi, to convert the strings in result['resources']:
['akdjlfijoaidjfod',
'g9jkidfjlskdjf44',
'3jdhfj4hf9dfiiu4']
to: https://api.example.com/devices/entities/devices/v1?ids=akdjlfijoaidjfod&ids=g9jkidfjlskdjf44&ids=3jdhfj4hf9dfiiu4
try this function:
def get_modified_url(mylist, myurl):
url = myurl + '?'
for idx, b in enumerate(mylist): # enumerate list to get index and element in the list
if idx > 0:
url += '&ids=' + b # append &ids= to url if not first device id
else:
url += 'ids=' + b # append ids= to url if first device id
return url
print(get_modified_url(result['resources'], BASE_URL + DEVICE_DETAILS ))
full code would be:
def get_modified_url(mylist, myurl):
url = myurl + '?'
for idx, b in enumerate(mylist): # enumerate list to get index and element in the list
if idx > 0:
url += '&ids=' + b # append &ids= to url if not first device id
else:
url += 'ids=' + b # append ids= to url if first device id
return url
device_list = []
DEVICE_search_response = requests.get(BASE_URL + DEVICE_SEARCH,params=call_params,headers=headers)
# Check for errors
if DEVICE_search_response.status_code != 200:
# Output debug information
print("\n Return Code: " + str(DEVICE_search_response.status_code) + " " + DEVICE_search_response.reason)
print("Path: " + DEVICE_search_response.request.path_url)
print("Headers: ")
print(DEVICE_search_response.request.headers)
print("Body: " + DEVICE_search_response.request.body)
print("\n")
print("Trace_ID: " + DEVICE_search_response.json()['meta']['trace_id'])
else:
# Iterate the results and print
result = DEVICE_search_response.json()
print("DEVICE found on " + str(len(result['resources'])) + " the following device id:")
for devices in result['resources']:
print(devices)
device_list.append(devices)
new_url = get_modified_url(device_list, BASE_URL + DEVICE_DETAILS )
DEVICE_DETAILS_response = requests.get(new_url, headers=headers)
if DEVICE_DETAILS_response.status_code != 200:
# Output debug information
print("\n Return Code: " + str(DEVICE_DETAILS_response.status_code) + " " + DEVICE_DETAILS_response.reason)
print("Path: " + DEVICE_DETAILS_response.request.path_url)
print("Headers: ")
print(DEVICE_DETAILS_response.request.headers)
print("Body: " + DEVICE_DETAILS_response.request.body)
print("\n")
print("Trace_ID: " + DEVICE_DETAILS_response.json()['meta']['trace_id'])
else:
result = DEVICE_DETAILS_response.json()
print("Device Details Found")
for details in result['resources']:
print(details)
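As a side note, requests can build that repeated ids= query string for you if you pass the list through params, so the URL helper is not strictly needed (a small sketch using the names from the code above):

DEVICE_DETAILS_response = requests.get(BASE_URL + DEVICE_DETAILS,
                                       params={"ids": device_list},
                                       headers=headers)
# requests encodes a list value as repeated keys: ?ids=...&ids=...&ids=...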
I have this code below that iterates through some tracks. Then, for each track, I want to use the Musixmatch API to get and print the lyrics of the track based on the artist name and track name.
Code that iterates through some tracks and prints the lyrics:
for i, v in tracks.items():
artist = tracks[i]['artist'].replace(" ", "+")
title = tracks[i]['title'].replace(" ", "+")
print(tracks)
print(song_lyric(title, artist))
The print(tracks) output is in this format:
{12: {'trackID': 12, 'title': 'Achtung Baby', 'number': '1', 'artist': 'U2', 'album': 'Achtung Baby', 'albumID': 2, 'duration': '291'}}
When the code is executed, the lyrics for the first tracks are printed, but then an error appears:
Traceback (most recent call last):
File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 239, in <module>
print(song_lyric(title, artist))
File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 72, in song_lyric
lyrics_tracking(tracking_url)
File "C:/Users/Ozzy/PycharmProjects/getData/getData.py", line 79, in lyrics_tracking
request = urllib.request.Request(querystring)
File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 329, in __init__
self.full_url = url
File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 355, in full_url
self._parse()
File "C:\Users\Ozzy\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 384, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Do you know why this error is appearing?
The methods to get the lyrics from Musixmatch are publicly available:
def song_lyric(song_name, artist_name):
while True:
querystring = apiurl_musixmatch + "matcher.lyrics.get?q_track=" + urllib.parse.quote(
song_name) + "&q_artist=" + urllib.parse.quote(
artist_name) + "&apikey=" + apikey_musixmatch + "&format=json&f_has_lyrics=1"
# matcher.lyrics.get?q_track=sexy%20and%20i%20know%20it&q_artist=lmfao
request = urllib.request.Request(querystring)
# request.add_header("Authorization", "Bearer " + client_access_token)
request.add_header("User-Agent",
"curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)") # Must include user agent of some sort, otherwise 403 returned
while True:
try:
response = urllib.request.urlopen(request,
timeout=4) # timeout set to 4 seconds; automatically retries if times out
raw = response.read()
except socket.timeout:
print("Timeout raised and caught")
continue
break
json_obj = json.loads(raw.decode('utf-8'))
body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
copyright = json_obj["message"]["body"]["lyrics"]["lyrics_copyright"]
tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
#print(tracking_url)
lyrics_tracking(tracking_url)
return (body + "\n\n" + copyright)
def lyrics_tracking(tracking_url):
while True:
querystring = tracking_url
request = urllib.request.Request(querystring)
# request.add_header("Authorization", "Bearer " + client_access_token)
request.add_header("User-Agent",
"curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)") # Must include user agent of some sort, otherwise 403 returned
try:
response = urllib.request.urlopen(request,
timeout=4) # timeout set to 4 seconds; automatically retries if times out
raw = response.read()
except socket.timeout:
print("Timeout raised and caught")
continue
break
print(raw)
Full working example that reproduces the error:
import requests
import json
import urllib.request, urllib.error, urllib.parse
import socket
apikey_musixmatch = '0b4a363bbd71974c2634837d5b5d1d9a' #generated for the example
apiurl_musixmatch = 'http://api.musixmatch.com/ws/1.1/'
api_key = "b088cbedecd40b35dd89e90f55227ac2" #generated for the example
def song_lyric(song_name, artist_name):
while True:
querystring = apiurl_musixmatch + "matcher.lyrics.get?q_track=" + urllib.parse.quote(
song_name) + "&q_artist=" + urllib.parse.quote(
artist_name) + "&apikey=" + apikey_musixmatch + "&format=json&f_has_lyrics=1"
# matcher.lyrics.get?q_track=sexy%20and%20i%20know%20it&q_artist=lmfao
request = urllib.request.Request(querystring)
# request.add_header("Authorization", "Bearer " + client_access_token)
request.add_header("User-Agent",
"curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)") # Must include user agent of some sort, otherwise 403 returned
while True:
try:
response = urllib.request.urlopen(request,
timeout=4) # timeout set to 4 seconds; automatically retries if times out
raw = response.read()
except socket.timeout:
print("Timeout raised and caught")
continue
break
json_obj = json.loads(raw.decode('utf-8'))
body = json_obj["message"]["body"]["lyrics"]["lyrics_body"]
copyright = json_obj["message"]["body"]["lyrics"]["lyrics_copyright"]
tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
print("Tracking_url====================" +tracking_url + "==================================")
lyrics_tracking(tracking_url)
return (body + "\n\n" + copyright)
def lyrics_tracking(tracking_url):
while True:
querystring = tracking_url
request = urllib.request.Request(querystring)
# request.add_header("Authorization", "Bearer " + client_access_token)
request.add_header("User-Agent",
"curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)") # Must include user agent of some sort, otherwise 403 returned
try:
response = urllib.request.urlopen(request,
timeout=4) # timeout set to 4 seconds; automatically retries if times out
raw = response.read()
except socket.timeout:
print("Timeout raised and caught")
continue
break
print(raw)
ID = 0
#get top artists from country
artists = {}
for i in range(2, 3):
artists_response = requests.get(
'http://ws.audioscrobbler.com/2.0/?method=geo.gettopartists&country=spain&format=json&page=' + str(i) + '&api_key=' + api_key)
artists_data = artists_response.json()
for artist in artists_data["topartists"]["artist"]:
name = artist["name"]
url = artist["url"]
if ID > 1: continue
artists[ID] = {}
artists[ID]['ID'] = ID
artists[ID]['name'] = name
ID += 1
for i, v in artists.items():
chosen = artists[i]['name'].replace(" ", "+")
artist_response = requests.get(
'http://ws.audioscrobbler.com/2.0/?method=artist.getinfo&format=json&artist=' + chosen + '&api_key=' + api_key)
artist_data = artist_response.json()
# get top albums of the artists
albums = {}
for i, v in artists.items():
chosen = artists[i]['name'].replace(" ", "+")
topalbums_response = requests.get(
'http://ws.audioscrobbler.com/2.0/?method=artist.gettopalbums&format=json&artist=' + chosen + '&api_key=' + api_key + '&limit=5')
albums_data = topalbums_response.json()
for album in albums_data['topalbums']['album']:
name = album["name"]
url = album["url"]
albums[ID] = {}
albums[ID]['ID'] = ID
albums[ID]['artist'] = artists[i]['name']
albums[ID]['artistID'] = artists[i]['ID']
albums[ID]['name'] = name
ID += 1
# Get tracks of the album
tracks = {}
for i, v in albums.items():
artist = albums[i]['artist'].replace(" ", "+")
name = albums[i]['name'].replace(" ", "+")
album_response_data = requests.get(
'http://ws.audioscrobbler.com/2.0/?method=album.getinfo&format=json&api_key=' + api_key + '&artist=' + artist + '&album=' + name)
album_response = album_response_data.json()
for album in album_response['album']['tracks']['track']:
title = album['name']
tracks[ID] = {}
tracks[ID]['trackID'] = ID
tracks[ID]['title'] = title
tracks[ID]['artist'] = albums[i]['artist']
tracks[ID]['album'] = albums[i]['name']
tracks[ID]['albumID'] = albums[i]['ID']
ID += 1
for i, v in tracks.items():
artist = tracks[i]['artist'].replace(" ", "+")
title = tracks[i]['title'].replace(" ", "+")
# print the lyric of each track
print(song_lyric(title, artist))
It seems like the URL is not correct. It happens here:
tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
If you are able to run that API locally and see what is returned in tracking_url, you can find out what is wrong with it.
UPDATE:
I reproduced it: urllib.request cannot process an empty-string URL (""), so you need to check that tracking_url != "" and only request the tracking page when it is not an empty string or None.
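A minimal sketch of that guard, placed where song_lyric currently calls lyrics_tracking (the variable names follow the question's code):

tracking_url = json_obj["message"]["body"]["lyrics"]["html_tracking_url"]
if tracking_url:  # only follow the tracking URL when the API actually returned one
    lyrics_tracking(tracking_url)
else:
    print("No tracking URL returned for this track, skipping.")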
I am using a weather API to design a Slack bot service in Python.
My source code is:
import requests
import re
import json
from bs4 import BeautifulSoup
def weather(cityname):
cityid = extractid(cityname)
url = "http://api.openweathermap.org/data/2.5/forecast?id=" + str(cityid) + "&APPID=c72f730d08a4ea1d121c8e25da7e4411"
while True:
r = requests.get(url, timeout=5)
while r.status_code is not requests.codes.ok:
r = requests.get(url, timeout=5)
soup = BeautifulSoup(r.text)
data = ("City: " + soup.city["name"] + ", Country: " + soup.country.text + "\nTemperature: " + soup.temperature["value"] +
" Celsius\nWind: " + soup.speed["name"] + ", Direction: " + soup.direction["name"] + "\n\n" + soup.weather["value"])
# print data
return data
def extractid(cname):
with open('/home/sourav/Git-Github/fabulous/fabulous/services/city.list.json') as data_file:
data = json.load(data_file)
for item in data:
if item["name"] == cname:
return item["id"]
def on_message(msg, server):
text = msg.get("text", "")
match = re.findall(r"~weather (.*)", text)
if not match:
return
searchterm = match[0]
return weather(searchterm.encode("utf8"))
on_bot_message = on_message
But executing the code gives the following error-
File "/usr/local/lib/python2.7/dist-packages/fabulous-0.0.1-py2.7.egg/fabulous/services/weather.py", line 19, in weather
" Celsius\nWind: " + soup.speed["name"] + ", Direction: " + soup.direction["name"] + "\n\n" + soup.weather["value"])
TypeError: 'NoneType' object has no attribute '__getitem__'
I can't figure out what the error is. Please help!
__getitem__ is called when you ask for a dictionary key, i.e. a['abc'] translates to a.__getitem__('abc'),
so in this case one attribute of soup is None (speed, direction or weather).
Ensure that your r.text contains the data you want; simply print it:
print(r.text)
To list the structure of the parsed data:
for child in soup.findChildren():
    print child
Always assume your input data might be wrong: instead of doing soup.city, do soup.find('city'), which might return nothing, so:
city = soup.find('city')
if city is not None:
    city_name = city['name']
else:
    city_name = 'Error'  # or empty, or something else
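The same defensive lookup can be applied to every field used in weather(); one possible helper (a sketch using the tag names from the question's code; safe_value and the fallback string are assumptions) is:

def safe_value(soup, tag, attr, default='N/A'):
    # Return soup.find(tag)[attr] if the tag and attribute exist, else a default.
    node = soup.find(tag)
    if node is not None and node.has_attr(attr):
        return node[attr]
    return default

temperature = safe_value(soup, 'temperature', 'value')
wind = safe_value(soup, 'speed', 'name')
direction = safe_value(soup, 'direction', 'name')
condition = safe_value(soup, 'weather', 'value')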
I think I have a very basic understanding of how threading works, but since I don't know that much I can't figure this out. I want to have a pool limit of about 10 threads, but the tricky part is I don't know how to make it read line by line.
proxies = {
'http': 'http://123.10.210.213:9999',
'https': 'http://123.10.210.213:9999'
}
def create_proxy_lst(txt):
print("""
########################################
# WORKING | NOT WORKING #
########################################
""")
proxy_list = []
with open(txt) as f:
for line in f:
proxy_list.append(line.strip('\n'))
return proxy_list
def check_proxy(website="https://google.com/"):
working = 0
not_working = 0
total = 0
lst = create_proxy_lst("uncheckedproxys.txt")
for proxy in lst:
try:
proxies["https"] = "http://" + proxy
proxies["http"] = "http://" + proxy
r = requests.get(website, timeout=1, proxies=proxies)
if r.status_code == 200:
print("%s" % proxy)
working += 1
total += 1
os.system("title Working: " + str(working) + "\t Not working " + str(not_working) + " ✔" + " Total: " + str(total) + "/" + str(len(lst)))
except Exception:
print("\t\t %s" % proxy)
not_working += 1
total += 1
os.system("title Working: " + str(working) + "\t Not working " + str(not_working) + " ✖" + " Total: " + str(total) + "/" + str(len(lst)))
Put your proxies into a Queue (queue.Queue on Python 3, Queue.Queue on Python 2), and then start 10 threads that each read proxies from the queue.
In your case:
from queue import Queue   # on Python 2 this is "from Queue import Queue"
from threading import Thread
import requests
import os

def worker(proxy_queue, total_count, website="https://google.com/"):
    # Note: these counters are local to each thread, so the console title only
    # reflects the proxies checked by this particular thread.
    working = 0
    not_working = 0
    total = 0
    while not proxy_queue.empty():
        proxy = proxy_queue.get()
        # build a per-thread proxies dict so threads don't overwrite each other
        proxies = {"http": "http://" + proxy, "https": "http://" + proxy}
        try:
            r = requests.get(website, timeout=1, proxies=proxies)
            if r.status_code == 200:
                print("%s" % proxy)
                working += 1
                total += 1
                os.system("title Working: " + str(working) + "\t Not working " + str(not_working) + " ✔" + " Total: " + str(total) + "/" + str(total_count))
        except Exception:
            print("\t\t %s" % proxy)
            not_working += 1
            total += 1
            os.system("title Working: " + str(working) + "\t Not working " + str(not_working) + " ✖" + " Total: " + str(total) + "/" + str(total_count))
if __name__ == '__main__':
    # Build a queue
    proxy_queue = Queue()
    # Put the proxies into the queue
    with open("uncheckedproxys.txt") as f:
        for line in f:
            proxy_queue.put(line.strip())
    total_count = proxy_queue.qsize()
    # Create the thread pool; note that args must be a tuple
    thread_pool = [Thread(target=worker, args=(proxy_queue, total_count)) for i in range(10)]
    # Start the threads, then wait for them all to finish
    for thread in thread_pool:
        thread.start()
    for thread in thread_pool:
        thread.join()
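If you would rather not manage the threads by hand, concurrent.futures gives you the same 10-thread limit with less code. Here is a minimal, untested sketch (the file name comes from the question; check_one and everything else are assumptions):

from concurrent.futures import ThreadPoolExecutor
import requests

def check_one(proxy, website="https://google.com/"):
    # Return (proxy, True/False) depending on whether the proxy answered with 200
    try:
        r = requests.get(website, timeout=1,
                         proxies={"http": "http://" + proxy, "https": "http://" + proxy})
        return proxy, r.status_code == 200
    except Exception:
        return proxy, False

if __name__ == "__main__":
    with open("uncheckedproxys.txt") as f:
        proxy_list = [line.strip() for line in f if line.strip()]
    # map() runs at most 10 checks at a time and yields results in input order
    with ThreadPoolExecutor(max_workers=10) as executor:
        for proxy, ok in executor.map(check_one, proxy_list):
            print(("%s" if ok else "\t\t %s") % proxy)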