Download .pdf files from a webiste using python - python

i am trying to download all the pdfs from the webiste provided and i am using the following code:
import mechanize
from time import sleep
br = mechanize.Browser()
br.open('http://www.nerc.com/comm/CCC/Pages/AgendasHighlightsandMinutes-.aspx')
f=open("source.html","w")
f.write(br.response().read())
filetypes=[".pdf"]
myfiles=[]
for l in br.links():
for t in filetypes:
if t in str(l):
myfiles.append(l)
def downloadlink(l):
f=open(l.text,"w")
br.click_link(l)
f.write(br.response().read())
print l.text," has been downloaded"
for l in myfiles:
sleep(1)
downloadlink(l)
keep on getting the following error and can't figure out the problem why.
legal and privacy has been downloaded
Traceback (most recent call last):
File "downloads-pdfs.py", line 29, in <module>
downloadlink(l)
File "downloads-pdfs.py", line 21, in downloadlink
f=open(l.text,"w")
IOError: [Errno 13] Permission denied: u'/trademark policy'

The problem you encounter arises because you use the link URL as a filename. The character '/' is not valid in a filename. Try to modify your downloadlink function to something like this:
def downloadlink(l):
filename = l.text.split('/')[-1]
with open(filename, "w") as f:
br.click_link(l)
f.write(br.response().read())
print l.text," has been downloaded"

Related

When I copy my instructor's website blocker code, why do I get a permission error even though his works and mine doesn't

So I am trying to code a website blocker. When I finished the code, I get a permission error. I am using a Windows computer. Can you please tell me what I am doing wrong or help fix my problem? Thank you in advance.
import time
from datetime import datetime as dt
hosts_path = r"C:\Windows\System32\drivers\etc\hosts"
redirect = "127.0.0.1"
website_list = ["https://www.youtube.com/", "youtube.com"]
final_list = [redirect + " "+ i for i in website_list]
final_string_block = "\n".join(final_list)
while True:
if dt(dt.now().year, dt.now().month, dt.now().day, 8,) < dt.now() < dt(dt.now().year, dt.now().month, dt.now().day,18):
print("Within Time...")
with open(hosts_path, "r+") as file:
content = file.read()
for website in website_list:
if website in content:
pass
else:
file.write(redirect+ ""+website+"\n")
else:
with open(hosts_path, "r+") as file:
content = file.readlines()
file.seek(0)
for line in content:
if not any(website in line for website in website_list):
file.write(line)
file.truncate()
time.sleep(5)
This is the error:
Traceback (most recent call last):
File "c:\Users\chris\.vscode\Realistic Programs\tempCodeRunnerFile.python", line 13, in <module>
with open(hosts_path, "r+") as file:
PermissionError: [Errno 13] Permission denied: 'C:\\Windows\\System32\\drivers\\etc\\hosts'
Because the hosts file is applicable to all users you need administrative privileges to write to it. Try running python as administrator and you should be able to edit the file.

Selenium after take screenshot unlink txt file

I have some txt files in a folder, I have listed directory and garbed links. After visit links using selenium I have taken screenshot. Now I am trying do delete this link txt file.
Below code I have tried
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
import os
path = "E:/xampp/htdocs/spool/"
directories = os.listdir(path)
for dir in directories:
# print(dir)
files = os.listdir(path+dir)
for file in files:
# print(path+dir+'/'+file)
f = open(path+dir+'/'+file, "r")
list = f.read()
data = list.split("||")
print(data[1])
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(data[1])
driver.save_screenshot(data[0]+'.png')
driver.close()
os.unlink(f.name)
Problem is unlink time it's giving below error
Traceback (most recent call last):
File "index.py", line 21, in <module>
os.unlink(f.name)
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'E:/xampp/htdocs/spool/7/2020-09-1112.txt'
I have also used os.close(3), After that got error "
list = f.read()
OSError: [Errno 9] Bad file descriptor
"
How can I unlink after take screenshot ?
Version python : Python 3.8.4
As u can see another process is using the txt file.
I think that's the problem here; you opened the file and didn't closed it.
I suggets you to visit https://www.tutorialspoint.com/python/file_close.htm
Try to call f.close() and then unlink.
This method of file handling is not entirely safe.
If an exception occurs when we are performing some operation with the file, the code exits with out closing the file.
In your case you forgot to close the file.
f.close()
I would recommend to use this approach to avoid such scenarios
with open("test.txt", mode= 'r', encoding = 'utf-8') as f:
# perform file operations
pass
# we dont need to explicitly close() the method, it is done internally.

getting permission denied error on downloading and saving images from url

I'm trying to make web scraper that downloads images from searched keywords. The code works completely fine until it has to download that image from extracted URL
from bs4 import BeautifulSoup
import requests
import os
import urllib
search = raw_input("search for images: ")
params = {"q": search}
r = requests.get("http://wwww.bing.com/images/search", params=params)
dir_name = search.replace(" ", "_").lower()
if not os.path.isdir(dir_name):
os.makedirs(dir_name)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for items in links:
img_obj = requests.get(items.attrs["href"])
print "Getting: ", items.attrs["href"]
title = items.attrs["href"].split("/")[-1]
urllib.urlretrieve(items.attrs["href"], "./scraped_images/")
OUTPUT:
search for images: cats
Getting: http://c1.staticflickr.com/3/2755/4353908962_2a0003aebf.jpg
Traceback (most recent call last):
File "C:/Users/qazii/PycharmProjects/WebScraping/exm.py", line 21, in <module>
urllib.urlretrieve(items.attrs["href"], "./scraped_images/")
File "E:\anaconda\envs\WebScraping\lib\urllib.py", line 98, in urlretrieve
return opener.retrieve(url, filename, reporthook, data)
File "E:\anaconda\envs\WebScraping\lib\urllib.py", line 249, in retrieve
tfp = open(filename, 'wb')
IOError: [Errno 13] Permission denied: './scraped_images/'
You're attempting to save the image to a "file" named ./scraped_images/. Since this is a directory and not a file, you get a permissions error (you can't open a directory with write permissions). Instead, try saving to a specific file name.
urllib.urlretrieve(items.attrs["href"], os.path.join("./scrapped_images", title))

copy the snippet java file in middle of temp file in python

I am a beginner in python3
I want to copy snippet java file in the middle of other temp file which i gain the address of this file from downloading URL.
my problem is when i execute my program i have this error:
RESTART: C:/Users/user/AppData/Local/Programs/Python/Python36/refactordwon.py
the Url is:
('C:\\Users\\user\\AppData\\Local\\Temp\\tmpq5m7m_og', <http.client.HTTPMessage object at 0x0000003A854879E8>)
Traceback (most recent call last):
File "C:/Users/user/AppData/Local/Programs/Python/Python36/refactordwon.py", line 14, in <module>
file_out = open("path_file" , "r")
FileNotFoundError: [Errno 2] No such file or directory: 'path_file'
>>>
i do not know why?
because when i download the url, url shows me this address:
the Url is:
('C:\\Users\\user\\AppData\\Local\\Temp\\tmpey3yovte', <http.client.HTTPMessage object at 0x0000002233347978>)
i tried to use this address in different way but anyway i have error. i found the temp file and copied in python address.
I am sure that i have this file and the address is correct but again i have error that can not find file.
could you help me, please?!
I hope my question is clear
my code is:
import urllib.request
import os
import tempfile
#download URL
#[-------------------------
url = 'http://pages.di.unipi.it/corradini/Didattica/AP-17/PROG-ASS/03/assignment3.html'
gt_url = urllib.request.urlretrieve(url)
print("the Url is: ")
print(gt_url)
#--------------------------]
#copy sniper java file inside remote file
#[--------------------------
path_file =r'C:/Users/user/AppData/Local/Programs/Python/Python36/tmpokv2s_dw'
file_out = open("path_file" , "r")
file_in = open("snip1.java", "r")
file_out.readlines()
open("file_back", "w")
file_back.write(file_out)
pos_fileout = file_back.tell()
file_back.seek(pos_fileout)
file_back.write(file_in)
print("the content of file is: ")
file_back.close()
file_out.close()
file_in.close()
open("file_back", "r")
file_back.readlines()
print(file_back)
file_back.close()

OSError: [Errno 22] Invalid argument for open()

import requests
from bs4 import BeautifulSoup
url = input("URL:")
grab_page = requests.get(url)
parse_page = BeautifulSoup(grab_page.text, "html.parser")
file_name = parse_page.title.string.replace("\\,()", "")
newfile = open(file_name + ".html", "w+")
newfile.write(grab_page.text)
When I try to run the above code, with this particular URL, where the title of webpage is "How to Install JDK 8 (on Windows,
Mac OS, Ubuntu) and Get Started with Java Programming" I received the following error:
Traceback (most recent call last):
File "C:/Users/LKT/PycharmProjects/webpagegrabber/main.py", line 12, in <module>
newfile = open(file_name + ".html", "w+")
OSError: [Errno 22] Invalid argument: 'How to Install JDK 8 (on Windows,\r\nMac OS, Ubuntu)
and Get Started with Java Programming.html'
Where did I go wrong?
Your file name contains invalid characters (\n, \r). So you cannot create such a file in Windows. As described in the Windows Developer Center:
Characters whose integer representations are in the range from 1
through 31, except for alternate data streams where these characters
are allowed. For more information about file streams, see File
Streams.

Categories