I was trying to download images from all the cases included in the CaseIDs list, but it doesn't work. I want the code to run for all cases.
from bs4 import BeautifulSoup
import requests as rq
from urllib.parse import urljoin
from tqdm import tqdm

CaseIDs = [100237, 99817, 100271]

with rq.session() as s:
    for caseid in tqdm(CaseIDs):
        url = 'https://crashviewer.nhtsa.dot.gov/nass-CIREN/CaseForm.aspx?xsl=main.xsl&CaseID= {caseid}'
        r = s.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        url = urljoin(url, soup.find('a', text='Text and Images Only')['href'])
        r = s.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        links = [urljoin(url, i['src']) for i in soup.select('img[src^="GetBinary.aspx"]')]
        count = 0
        for link in links:
            content = s.get(link).content
            with open("test_image" + str(count) + ".jpg", 'wb') as f:
                f.write(content)
            count += 1
Try using format() like this:
url = 'https://crashviewer.nhtsa.dot.gov/nass-CIREN/CaseForm.aspx?xsl=main.xsl&CaseID={}'.format(caseid)
You need to use an f-string to pass your caseId value in, as you're trying to do:
url = f'https://crashviewer.nhtsa.dot.gov/nass-CIREN/CaseForm.aspx?xsl=main.xsl&CaseID= {caseid}'
(You probably also need to remove the space between the = and the {)
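For reference, a minimal sketch of the corrected URL construction (same CaseIDs list as above; the only changes are the f prefix and removing the space after the =):

CaseIDs = [100237, 99817, 100271]
for caseid in CaseIDs:
    # f-string, no space between "CaseID=" and the value
    url = f'https://crashviewer.nhtsa.dot.gov/nass-CIREN/CaseForm.aspx?xsl=main.xsl&CaseID={caseid}'
    print(url)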
Related
My code below works, but I want it to do the exact same thing for the next pages of the URL variable; this would be done by appending the number 1, 2, 3, and so on, depending on the page.
The code essentially scrapes a website that has the thumbnails of various videos and then returns the link to each video. I want it to do this for every page available.
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
from urllib.request import Request, urlopen

URL = "domain.com/"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

endof = soup.find_all('div', class_="th-image")
links = [a['href'] for a in soup.find_all('a', href=True)]
endoflinks = links[8:-8]

index = 0
for a in endoflinks:
    index += 1
    dwnlink = "domain.com" + endoflinks[index]
    r = requests.get(dwnlink)
    f = open("output.txt", "a")
    print(r.url, file=f)
    f.close()
This should help you get going:
URL = "domain.com/"
for i in range(10):
    print("domain.com/" + str(i))
    r = requests.get(URL + str(i))
    f = open("output.txt", "a")
    print(r.url, file=f)
    f.close()
This prints:
domain.com/0
domain.com/1
domain.com/2
domain.com/3
domain.com/4
domain.com/5
domain.com/6
domain.com/7
domain.com/8
domain.com/9
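If it helps, here is a rough sketch of how that pagination could be combined with your scraping loop; the page-URL pattern, the number of pages, and the links[8:-8] slice are assumptions carried over from the snippets above:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE = "https://domain.com/"  # placeholder domain, as in the question

with open("output.txt", "a") as f:
    for page_number in range(10):  # assumed number of pages
        page = requests.get(BASE + str(page_number))
        soup = BeautifulSoup(page.content, "html.parser")
        # Same link extraction and slice as the original snippet
        links = [a['href'] for a in soup.find_all('a', href=True)]
        for href in links[8:-8]:
            r = requests.get(urljoin(BASE, href))
            print(r.url, file=f)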
I'm learning Python and web scraping. It is very cool, but I am not able to get what I want.
I'm trying to save product links in a text file so that I can scrape data from them afterwards.
Here is my script, which works (almost) correctly in the PyCharm console:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
My goal is to save the result of the links variable to a text file, line by line.
I tried this, but something is wrong and I can't get each URL:
for link in links:
    with open("urls.txt", "a") as f:
        f.write(links+"\n")
Please, can someone help me?
You can try it this way.
Just open the file once and write the complete data to it. Opening and closing files inside a loop is not a good thing to do.
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

with open('text.txt', 'w') as f:
    for i in range(15):
        url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        if response.ok:
            print('Page: ' + str(i))
            for data in soup.find_all('div', class_='price'):
                for a in data.find_all('a'):
                    link = 'https://www.topachat.com/' + a.get('href')
                    f.write(link + '\n')
Sample output from text.txt
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in11020650.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10119254.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20005046.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002036.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002591.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20004309.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002592.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10089390.html
.
.
.
Your problem is in the for link in links loop:
link = (a.get('href'))
links = ('https://www.topachat.com/' + link)
print(links)

for link in links:
    with open("urls.txt", "a") as f:
        f.write(links+"\n")
The type of links is string, and your for loop iterates over it character by character. That is why you see a single character on each line of your txt file. You can just remove that for loop and the code will work:
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
                with open("urls.txt", "a") as f:
                    f.write(links + "\n")
You can do it like this:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'
url_list = set()

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
                url_list.add(links)

with open("urls.txt", "a") as f:
    for link in url_list:
        f.write(link + "\n")
I need to download all the files from this page:
https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000&parentFilter=1433&childFilter=1433%7C1450&startMonth=1&startYear=2008&endMonth=6&endYear=2021
that have "Auction of" in their titles. This is the source for one of the files, for example:
Auction of £2,500 million of 0 5/8% Treasury Gilt 2035
I am trying to adapt some code I found from another question, but the pages are coming back empty:
import os
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def download_pgn(task):
    session, url, destination_path = task
    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    game_url = host + soup.find("a", text="download").get("href")
    filename = re.search(r"\w+\.pgn", game_url).group()
    path = os.path.join(destination_path, filename)
    response = session.get(game_url, stream=True)
    response.raise_for_status()
    with open(path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

if __name__ == "__main__":
    destination_path = "pgns"
    max_workers = 8
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)
    with requests.Session() as session:
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        pages = soup.find_all("a", href=re.compile(r".*Auction of\?.*"))
        tasks = [
            (session, host + page.get("href"), destination_path)
            for page in pages
        ]
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pool.map(download_pgn, tasks)
Check your regular expression syntax. The regex r".*Auction of\?.*" will only match titles that contain an actual "of?".
But the href= parameter will search against the URL in the link, so that won't help you much either. This will find the links with the matching titles:
links = soup.find_all("a", string=re.compile(r"Auction of\b"))
And this will extract their URLs so you can retrieve them:
[ file["href"] for file in links ]
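Putting those two pieces together, a rough sketch might look like this (the host prefix, the urljoin call, and the plain sequential loop are assumptions for illustration, not the original threaded script):

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

host = "https://www.dmo.gov.uk/"  # assumed site root
listing_url = "https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000&parentFilter=1433&childFilter=1433%7C1450&startMonth=1&startYear=2008&endMonth=6&endYear=2021"

resp = requests.get(listing_url)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")

# Match on the link text rather than the href
links = soup.find_all("a", string=re.compile(r"Auction of\b"))
for a in links:
    file_url = urljoin(host, a["href"])
    print(file_url)  # fetch with requests.get(file_url, stream=True) and save as needed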
This is what ended up working for me:
from bs4 import BeautifulSoup
import requests
import re

links = []
url = 'https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000000&parentFilter=1433&childFilter=1433|1450&startMonth=1&startYear=2000&endMonth=6&endYear=2021'
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")

for a in soup.find_all("a", {"aria-label": re.compile(r"^Auction of\b")}, href=True):
    links.append(a['href'])

def download_file(url):
    path = url.split('/')[-1].split('?')[0]
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            for chunk in r:
                f.write(chunk)

host = 'https://www.dmo.gov.uk/'
for link in links:
    url = host + link
    download_file(url)
The find_all() method accepts a function. You can pass a lambda to filter for all a tags whose text contains "Auction of":
for tag in soup.find_all(lambda t: t.name == "a" and "Auction of" in t.text):
    print(tag.text)
Or, you can use an [attribute*=value] CSS selector:
# Find all `a` tags whose `aria-label` attribute contains `Auction of`
for tag in soup.select("a[aria-label*='Auction of']"):
    print(tag.text)
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
url = "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
data = requests.get(url)
soup = bs(data.content,"html.parser")
The code below is a test to get one item.
property_overview = soup.find(class_="p24_regularListing").find(class_="p24_propertyOverview").find(class_='p24_propertyOverviewRow').find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview
Output : 'Listing Number'
The code below is what we have to get all of the col-xs-6 p24_propertyOverviewKey elements:
p24_regularListing_items = soup.find_all(class_="p24_regularListing")
for p24_propertyOverview_item in p24_regularListing_items:
    p24_propertyOverview_items = p24_propertyOverview_item.find_all(class_="p24_propertyOverview")
    for p24_propertyOverviewRow_item in p24_propertyOverview_items:
        p24_propertyOverviewRow_items = p24_propertyOverviewRow_item.find_all(class_="p24_propertyOverviewRow")
        for p24_propertyOverviewKey_item in p24_propertyOverviewRow_items:
            p24_propertyOverviewKey_items = p24_propertyOverviewKey_item.find_all(class_="col-xs-6 p24_propertyOverviewKey")
            p24_propertyOverviewKey_items
The code above only outputs one item, not all of them.
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests

resp = requests.get(
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")

texts = []
for tag in soup.select(
    # NB: this selector uses Python's implicit string concatenation
    # to split it onto several lines.
    ".p24_regularListing "
    ".p24_propertyOverview "
    ".p24_propertyOverviewRow "
    ".p24_propertyOverviewKey"
):
    texts.append(tag.get_text())

print(texts)
This is the code I used to take all the pics from r/pics on Reddit and put them into a directory. I want to be able to take the actual files in the directory and put them into a list. I'm stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')

image_tags = soup.findAll('img')

if not os.path.exists('direct'):
    os.makedirs('direct')
os.chdir('direct')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            img_path = 'direct-' + str(x) + '.jpg'
            with open(img_path, 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
    except:
        pass
Edit: Here is the updated code, but I'm still dealing with the problem:
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')

image_tags = soup.findAll('img')

if not os.path.exists('directory'):
    os.makedirs('directory')
os.chdir('directory')
x = 0

mylist = []
for image in image_tags:
    url = image['src']
    source = requests.get(url)
    if source.status_code == 200:
        img_path = 'direct-' + str(x) + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(requests.get(url).content)
            mylist.append(img_path)
            f.close()
        x += 1
print(mylist)
Create a list at the beginning of your code:
...
mylist = []
...
Then, after you save each image, add its path to the list:
...
img_path = 'direct-' + str(x) + '.jpg'
mylist.append(img_path)
...
EDIT:
I executed your updated code, and image_tags comes back empty. In fact, the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
doesn't contain any images. I guess Reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean
You should use the Reddit API so that Reddit doesn't limit your requests.
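As a rough sketch of that idea (the .json listing endpoint and the custom User-Agent header are assumptions about Reddit's public JSON API, not part of the original code):

import requests

# A descriptive User-Agent helps avoid Reddit throttling default clients.
headers = {"User-Agent": "my-image-scraper/0.1"}
resp = requests.get("https://www.reddit.com/r/drawing.json", headers=headers)
resp.raise_for_status()

image_urls = []
for post in resp.json()["data"]["children"]:
    link = post["data"].get("url", "")
    if link.endswith((".jpg", ".png")):
        image_urls.append(link)

print(image_urls)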