How can I download music files from websites using Python?
Here is my code:
from bs4 import BeautifulSoup
from requests import *
import urllib
link = input("https://www.chosic.com/free-music/all/")
url = urllib.request.urlopen(link)
content = url.read()
soup = BeautifulSoup(content,'html.parser')
for audio in soup.find_all('audio'):
    print(len(audio))
Traceback (most recent call last):
  File "C:\Users\pc\Desktop\Downloads files from url using python .py", line 8, in <module>
    url = urllib.request.urlopen(link)
  File "C:\Program Files\Python39\lib\urllib\request.py", line 214, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Program Files\Python39\lib\urllib\request.py", line 501, in open
    req = Request(fullurl, data)
  File "C:\Program Files\Python39\lib\urllib\request.py", line 320, in __init__
    self.full_url = url
  File "C:\Program Files\Python39\lib\urllib\request.py", line 346, in full_url
    self._parse()
  File "C:\Program Files\Python39\lib\urllib\request.py", line 375, in _parse
    raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Through the website link, I want to extract the mp3 and wav links. Can someone help me, please?
You can use the following example to download all the mp3 files from that page:
import requests
from bs4 import BeautifulSoup
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
}
url = "https://www.chosic.com/free-music/all/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
for u in soup.select("[data-url]"):
    u = u["data-url"]
    print("Downloading {}".format(u))
    with open(u.split("/")[-1], "wb") as f_out:
        f_out.write(requests.get(u, headers=headers).content)
Prints:
Downloading https://www.chosic.com/wp-content/uploads/2020/06/John_Bartmann_-_09_-_Happy_Clappy-1.mp3
Downloading https://www.chosic.com/wp-content/uploads/2020/11/batchbug-sweet-dreams.mp3
Downloading https://www.chosic.com/wp-content/uploads/2021/01/fm-freemusic-inspiring-optimistic-upbeat-energetic-guitar-rhythm.mp3
Downloading https://www.chosic.com/wp-content/uploads/2021/02/keys-of-moon-white-petals.mp3
...and so on.
and saves the *.mp3 files.
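Since the question also asked about wav links, a minimal variation of the same loop could filter by extension. This is a sketch, assuming any wav files on the page are exposed through the same data-url attributes:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
}

url = "https://www.chosic.com/free-music/all/"
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

for tag in soup.select("[data-url]"):
    u = tag["data-url"]
    # keep only the audio formats the question asked about
    if u.lower().endswith((".mp3", ".wav")):
        print("Downloading {}".format(u))
        with open(u.split("/")[-1], "wb") as f_out:
            f_out.write(requests.get(u, headers=headers).content)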
Related
I'm just starting/learning to use the Google Cloud platform (functions in particular) and I wrote a simple python scraper using BeautifulSoup that is returning an error and I can't figure out why.
from bs4 import BeautifulSoup
import requests
def hello_world(request):
    """Responds to any HTTP request.
    Args:
        request (flask.Request): HTTP request object.
    Returns:
        The response text or any set of values that can be turned into a
        Response object using
        `make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    url = 'https://example.com/'
    req = requests.get(url, headers = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title
    print(title)
    return title
When I print the title of the scraped page, that shows up in the logs fine. When I return the variable though, the logs report an "IndexError: list index out of range". When I return soup.prettify() it also works fine.
This is the Traceback that I get in the GCP logs
Traceback (most recent call last):
  File "/layers/google.python.pip/pip/lib/python3.9/site-packages/flask/app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "/layers/google.python.pip/pip/lib/python3.9/site-packages/flask/app.py", line 1953, in full_dispatch_request
    return self.finalize_request(rv)
  File "/layers/google.python.pip/pip/lib/python3.9/site-packages/flask/app.py", line 1968, in finalize_request
    response = self.make_response(rv)
  File "/layers/google.python.pip/pip/lib/python3.9/site-packages/flask/app.py", line 2117, in make_response
    rv = self.response_class.force_type(rv, request.environ)
  File "/layers/google.python.pip/pip/lib/python3.9/site-packages/werkzeug/wrappers/base_response.py", line 269, in force_type
    response = BaseResponse(*_run_wsgi_app(response, environ))
  File "/layers/google.python.pip/pip/lib/python3.9/site-packages/werkzeug/wrappers/base_response.py", line 26, in _run_wsgi_app
    return _run_wsgi_app(*args)
  File "/layers/google.python.pip/pip/lib/python3.9/site-packages/werkzeug/test.py", line 1123, in run_wsgi_app
    return app_iter, response[0], Headers(response[1])
IndexError: list index out of range
The problem is probably that you return soup.title, which is a bs4 Tag object rather than a string, so Flask cannot turn it into a Response.
By the way, try this code; maybe it is easier to understand:
from bs4 import BeautifulSoup
import requests

url = 'https://stackoverflow.com'

def titleScraper(url):
    req = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"})
    soup = BeautifulSoup(req.content, 'html.parser')
    return soup.title.get_text()

title = titleScraper(url)
print(title)
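Adapted back to the Cloud Functions entry point from the question, the key change is returning a plain string instead of a Tag. A sketch under that assumption:
from bs4 import BeautifulSoup
import requests

def hello_world(request):
    url = 'https://example.com/'
    req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
    soup = BeautifulSoup(req.text, 'html.parser')
    # return a str, not a bs4 Tag; Flask knows how to turn a str into a Response
    return soup.title.get_text()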
I am brand new to using BeautifulSoup and I am running into an odd issue, likely user error, but I am stumped! I am using BeautifulSoup to parse through a webpage, and return the first a tag with an href attribute. When I use the Wikipedia link, it works as expected! However when I use the BestBuy link, it leads to this timeout...
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request
# url = r"https://en.wikipedia.org/wiki/Eastern_Front_(World_War_II)"
url = r"https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255"
html_content = urllib.request.urlopen(url)
soup = BeautifulSoup(html_content, 'html.parser')
link = soup.find('a', href=True)
print(link)
Traceback (most recent call last):
  File "scrapper.py", line 8, in <module>
    html_content = urllib.request.urlopen(url)
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
    return self.do_open(http.client.HTTPSConnection, req,
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1354, in do_open
    r = h.getresponse()
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "/usr/local/Cellar/python@3.8/3.8.5/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1099, in read
    return self._sslobj.read(len, buffer)
TimeoutError: [Errno 60] Operation timed out
Do you guys have any insight as to why this might be happening with only certain URL's? Thanks in advance!
You cannot scrape every website with BeautifulSoup; some websites have restrictions. The best practice is to always send headers:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
url = r"https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255"
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup.prettify())
Output:
<html>
 <head>
  <title>
   Access Denied
  </title>
 </head>
 <body>
  <h1>
   Access Denied
  </h1>
  You don't have permission to access "http://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?" on this server.
  <p>
   Reference #18.9f01d517.1595655333.b833c
  </p>
 </body>
</html>
You can achieve this task using Selenium; follow the steps below:
Step 1: Download the web driver for Chrome:
First check your Chrome version (Browser's Menu (triple vertical dots) -> Help -> About Google Chrome)
Step 2: Download the driver from here according to your Chrome browser version (mine is 81.0.4044.138)
Step 3: Once downloaded, unzip the file and place chromedriver.exe in the directory where your script is.
Step 4: pip install selenium
Now use the code below:
from selenium import webdriver
from bs4 import BeautifulSoup
#your website url
site = 'https://www.bestbuy.com/site/nintendo-switch-32gb-console-neon-red-neon-blue-joy-con/6364255.p?skuId=6364255'
#your driver path
driver = webdriver.Chrome(executable_path = 'chromedriver.exe')
#passing website url
driver.get(site)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
link = soup.find('a', href=True)
print(link)
Output:
<a href="https://www.bestbuy.ca/en-CA/home.aspx">
<img alt="Canada" src="https://www.bestbuy.com/~assets/bby/_intl/landing_page/images/maps/canada.svg"/>
<h4>Canada</h4>
</a>
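Note that newer Selenium releases (4.x) deprecate executable_path. A minimal sketch of the equivalent setup, assuming chromedriver.exe still sits next to the script:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the driver path in a Service object
driver = webdriver.Chrome(service=Service('chromedriver.exe'))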
My current program looks like this:
import os
import urllib.request
baseUrl = "https://website.com/wp-content/upload/xxx/yyy/zzz-%s.jpg"
for i in range(1,48):
    url = baseUrl % i
    urllib.request.urlretrieve(baseUrl, os.path.basename(url))
I haven't coded Python in a long time, but I wrote this using urllib2 back when I used Python 2.7.
It is supposed to replace the %s in the URL, loop through 1-48, and download all the images to the directory that the script is in. But I get a lot of errors.
Edit: here is the error that is thrown.
Traceback (most recent call last):
  File "download.py", line 9, in <module>
    urllib.request.urlretrieve(url, os.path.basename(url))
  File "C:\Program Files\Python37\lib\urllib\request.py", line 247, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "C:\Program Files\Python37\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Program Files\Python37\lib\urllib\request.py", line 531, in open
    response = meth(req, response)
  File "C:\Program Files\Python37\lib\urllib\request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Program Files\Python37\lib\urllib\request.py", line 569, in error
    return self._call_chain(*args)
  File "C:\Program Files\Python37\lib\urllib\request.py", line 503, in _call_chain
    result = func(*args)
  File "C:\Program Files\Python37\lib\urllib\request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
urllib.request is only available in Python 3, so you have to run the code with Python 3.
Try using the requests module:
import os
import requests

baseUrl = "https://website.com/wp-content/upload/xxx/yyy/zzz-%s.jpg"
for i in range(1,48):
    url = baseUrl % i
    response = requests.get(url)
    my_raw_data = response.content
    with open(os.path.basename(url), 'wb') as my_data:
        my_data.write(my_raw_data)
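If the server still answers 403, sending a browser-like User-Agent header with requests often helps. A sketch; the header value is just an example:
import os
import requests

baseUrl = "https://website.com/wp-content/upload/xxx/yyy/zzz-%s.jpg"
headers = {"User-Agent": "Mozilla/5.0"}  # example browser-like value

for i in range(1,48):
    url = baseUrl % i
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # fail loudly on 403/404 instead of saving error pages
    with open(os.path.basename(url), 'wb') as my_data:
        my_data.write(response.content)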
Just to add, you must use url in the request, not baseUrl as shown in your code:
import os
import urllib.request
baseUrl = "https://website.com/wp-content/upload/xxx/yyy/zzz-%s.jpg"
for i in range(1,48):
    url = baseUrl % i
    #urllib.request.urlretrieve(baseUrl, os.path.basename(url))
    #Use this line:
    urllib.request.urlretrieve(url, os.path.basename(url))
Run this in Python 3
Simple fix, if you pass the correct string:
urllib.request.urlretrieve(url, os.path.basename(url))
The documentation says urlretrieve is a Legacy carryover, so you might want to find a different way to do this.
I found this alternate approach modified from another SO answer:
import os
import requests
baseUrl = "https://website.com/wp-content/upload/xxx/yyy/zzz-%s.jpg"
for i in range(1,48):
    url = baseUrl % i
    r = requests.get(url)
    open(os.path.basename(url), 'wb').write(r.content)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'http://www.csgoanalyst.win'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
page_soup.body
I am trying to scrape hltv.org in order to find out what maps each team bans and picks. However, I keep getting the following error:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/anaconda/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/anaconda/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/anaconda/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/anaconda/lib/python3.6/urllib/request.py", line 570, in error
    return self._call_chain(*args)
  File "/anaconda/lib/python3.6/urllib/request.py", line 504, in _call_chain
    result = func(*args)
  File "/anaconda/lib/python3.6/urllib/request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
>>> page_html = uClient.read()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'uClient' is not defined
>>> uClient.close()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'uClient' is not defined
I tried the script on another website, so I know it works. I assume hltv has blocked bots from doing this, and I know I shouldn't be doing it if they don't want people to, but I would love to get the data.
Any help would be super helpful.
Thank you.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup

my_url = 'https://www.hltv.org/stats/teams/maps/6665/Astralis'
u_client = uReq(my_url)
soup = BeautifulSoup(u_client, "html.parser")
print(soup)
and if you want to remove the tags:
import bleach
print(bleach.clean(str(soup), tags=[], strip=True))
I suggest you use the requests module instead of urllib. It's fast and has other advantages as well. You're getting Forbidden because you're lacking a User-Agent header. Try something like the following:
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}
my_url = 'https://www.hltv.org/stats/teams/maps/6665/Astralis'
page = requests.get(my_url, headers=headers)
page_html = page.text
requests can be easily installed using pip: pip install requests
You can add headers using urllib as well, but it's slightly more complex syntactically and perhaps slower.
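For comparison, a minimal sketch of the urllib route with a header attached:
from urllib.request import Request, urlopen

my_url = 'https://www.hltv.org/stats/teams/maps/6665/Astralis'
# attach the User-Agent at Request construction time
req = Request(my_url, headers={'User-Agent': 'Mozilla/5.0'})
page_html = urlopen(req).read().decode('utf-8')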
I want to send my application/zip data to the server without pycurl or other libs. I am a newbie with cURL. First, I successfully sent text/xml data with this code:
import urllib2
req = urllib2.Request("http://192.168.79.131/rest", headers = {"Content-type" : "text/xml" , "Accept" : "*/*"} , data = '<income><name>acme7</name></income>')
f = urllib2.urlopen(req)
But now i want to upload my ZIP file to the server. I tried this code:
import urllib2
zipPath = "c:/somedir/ways.zip"
zipData = open(zipPath, "rb")
req = urllib2.Request("http://192.168.79.131/rest", headers = {"Content-type" : "application/zip" , "Accept" : "*/*"} , data = zipData)
f = urllib2.urlopen(req)
I got this error:
Traceback (most recent call last):
  File "<pyshell#25>", line 1, in <module>
    f = urllib2.urlopen(req)
  File "C:\Python27\lib\urllib2.py", line 126, in urlopen
    return _opener.open(url, data, timeout)
  File "C:\Python27\lib\urllib2.py", line 386, in open
    protocol = req.get_type()
  File "C:\Python27\lib\urllib2.py", line 248, in get_type
    raise ValueError, "unknown url type: %s" % self.__original
ValueError: unknown url type: /rest/income
Have you considered using something like Requests? It handles a lot of the urllib2 stuff so you don't have to:
import requests
url = 'http://httpbin.org/post'
files = {'file': open('c:/somedir/ways.zip', 'rb')}
r = requests.post(url, files=files)
print r
Prints:
<Response [200]>
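If you would rather stay with urllib2, note that the traceback complains about a relative URL ('/rest/income'): urlopen needs an absolute URL including the scheme. Reading the file into memory up front also avoids surprises with file objects. A sketch using the endpoint from the question:
import urllib2

zipPath = "c:/somedir/ways.zip"
with open(zipPath, "rb") as f:
    zipData = f.read()  # read the bytes up front

# the URL must be absolute, scheme included
req = urllib2.Request("http://192.168.79.131/rest",
                      headers={"Content-type": "application/zip", "Accept": "*/*"},
                      data=zipData)
f = urllib2.urlopen(req)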