import requests
from bs4 import BeautifulSoup
url = input("URL:")
grab_page = requests.get(url)
parse_page = BeautifulSoup(grab_page.text, "html.parser")
file_name = parse_page.title.string.replace("\\,()", "")
newfile = open(file_name + ".html", "w+")
newfile.write(grab_page.text)
When I try to run the above code, with this particular URL, where the title of webpage is "How to Install JDK 8 (on Windows,
Mac OS, Ubuntu) and Get Started with Java Programming" I received the following error:
Traceback (most recent call last):
File "C:/Users/LKT/PycharmProjects/webpagegrabber/main.py", line 12, in <module>
newfile = open(file_name + ".html", "w+")
OSError: [Errno 22] Invalid argument: 'How to Install JDK 8 (on Windows,\r\nMac OS, Ubuntu)
and Get Started with Java Programming.html'
Where did I go wrong?
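Your file name contains invalid characters (\r and \n), so Windows cannot create a file with that name. Note that replace("\\,()", "") only removes the exact substring \,() — it does not strip those characters individually, and it never touches line breaks. As described in the Windows Developer Center, the following are not allowed in file names: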
Characters whose integer representations are in the range from 1
through 31, except for alternate data streams where these characters
are allowed. For more information about file streams, see File
Streams.
Related
I have some txt files in a folder. I have listed the directory and grabbed the links from the files. After visiting each link using selenium, I take a screenshot. Now I am trying to delete the link's txt file.
Below code I have tried
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
import os
path = "E:/xampp/htdocs/spool/"
directories = os.listdir(path)
for dir in directories:
# print(dir)
files = os.listdir(path+dir)
for file in files:
# print(path+dir+'/'+file)
f = open(path+dir+'/'+file, "r")
list = f.read()
data = list.split("||")
print(data[1])
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(data[1])
driver.save_screenshot(data[0]+'.png')
driver.close()
os.unlink(f.name)
The problem is that the unlink call gives the error below:
Traceback (most recent call last):
File "index.py", line 21, in <module>
os.unlink(f.name)
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'E:/xampp/htdocs/spool/7/2020-09-1112.txt'
I have also tried os.close(3); after that I got the error "
list = f.read()
OSError: [Errno 9] Bad file descriptor
"
How can I unlink the file after taking the screenshot?
Version python : Python 3.8.4
As you can see, another process is using the txt file.
I think that's the problem here: you opened the file and didn't close it.
I suggest you visit https://www.tutorialspoint.com/python/file_close.htm
Try calling f.close() before the unlink.
This method of file handling is not entirely safe.
If an exception occurs while we are performing some operation on the file, the code exits without closing the file.
In your case, you forgot to close the file:
f.close()
I would recommend using this approach to avoid such scenarios:
with open("test.txt", mode= 'r', encoding = 'utf-8') as f:
# perform file operations
pass
# We don't need to call close() on the file explicitly; the with statement does it for us.
I am trying to download all the PDFs from the website below, and I am using the following code:
import mechanize
from time import sleep
br = mechanize.Browser()
br.open('http://www.nerc.com/comm/CCC/Pages/AgendasHighlightsandMinutes-.aspx')
f=open("source.html","w")
f.write(br.response().read())
filetypes=[".pdf"]
myfiles=[]
for l in br.links():
for t in filetypes:
if t in str(l):
myfiles.append(l)
def downloadlink(l):
f=open(l.text,"w")
br.click_link(l)
f.write(br.response().read())
print l.text," has been downloaded"
for l in myfiles:
sleep(1)
downloadlink(l)
I keep getting the following error and can't figure out why:
legal and privacy has been downloaded
Traceback (most recent call last):
File "downloads-pdfs.py", line 29, in <module>
downloadlink(l)
File "downloads-pdfs.py", line 21, in downloadlink
f=open(l.text,"w")
IOError: [Errno 13] Permission denied: u'/trademark policy'
The problem you encounter arises because you use the link text as a filename. The character '/' is a path separator and is not valid inside a filename, so '/trademark policy' is treated as a file in the root directory. Try to modify your downloadlink function to something like this:
def downloadlink(l):
    """Download the target of link *l* into a file named after its text.

    Only the part of the link text after the last '/' is used as the
    file name, because '/' is a path separator and cannot appear inside
    a file name.
    """
    filename = l.text.split('/')[-1]
    # follow_link() actually opens the link; click_link() only builds a
    # Request object and leaves br.response() pointing at the old page.
    response = br.follow_link(l)
    # PDFs are binary data, so write in binary mode to avoid newline
    # translation corrupting the file on Windows.
    with open(filename, "wb") as f:
        f.write(response.read())
    print(l.text + " has been downloaded")
I am a beginner in Python 3.
I want to copy a Java snippet file into the middle of a temporary file, whose path I get from downloading a URL.
My problem is that when I execute my program I get this error:
RESTART: C:/Users/user/AppData/Local/Programs/Python/Python36/refactordwon.py
the Url is:
('C:\\Users\\user\\AppData\\Local\\Temp\\tmpq5m7m_og', <http.client.HTTPMessage object at 0x0000003A854879E8>)
Traceback (most recent call last):
File "C:/Users/user/AppData/Local/Programs/Python/Python36/refactordwon.py", line 14, in <module>
file_out = open("path_file" , "r")
FileNotFoundError: [Errno 2] No such file or directory: 'path_file'
>>>
I do not know why this happens,
because when I download the URL, it shows me this path:
the Url is:
('C:\\Users\\user\\AppData\\Local\\Temp\\tmpey3yovte', <http.client.HTTPMessage object at 0x0000002233347978>)
I tried to use this address in different ways, but I still get the error. I found the temp file and copied it into the Python folder.
I am sure that I have this file and that the path is correct, but I still get the error that the file cannot be found.
Could you help me, please?
I hope my question is clear.
My code is:
import urllib.request
import os
import tempfile
#download URL
#[-------------------------
url = 'http://pages.di.unipi.it/corradini/Didattica/AP-17/PROG-ASS/03/assignment3.html'
gt_url = urllib.request.urlretrieve(url)
print("the Url is: ")
print(gt_url)
#--------------------------]
#copy sniper java file inside remote file
#[--------------------------
path_file =r'C:/Users/user/AppData/Local/Programs/Python/Python36/tmpokv2s_dw'
file_out = open("path_file" , "r")
file_in = open("snip1.java", "r")
file_out.readlines()
open("file_back", "w")
file_back.write(file_out)
pos_fileout = file_back.tell()
file_back.seek(pos_fileout)
file_back.write(file_in)
print("the content of file is: ")
file_back.close()
file_out.close()
file_in.close()
open("file_back", "r")
file_back.readlines()
print(file_back)
file_back.close()
I suspect this is a very newbie question but I can't find any solutions that are helping :(
I've been trying to get started with Python by building a simple Twitter bot which replies to people who tweet at it. It worked locally, and it doesn't work on Heroku.
A quick rundown: Each time the bot tweets, it uses a script called mainscript.py which writes the ID of the last tweet replied to into a separate file called lastid.py. The next time the script runs, it opens lastid.py, checks the number inside against the current list of tweets, and only responds to those with a larger ID number than the one stored in lastid.py.
fp = open("lastid.py", 'r')
last_id_replied = fp.read()
fp.close()
#(snipped - the bot selects the tweet and sends it here...)
fp = open("lastid.py", 'w')
fp.write(str(status.id))
fp.close()
This works great locally. Runs fine. However, when I upload it to Heroku I get this error:
Traceback (most recent call last):
File "/app/workspace/mainscript.py", line 60, in <module>
fp = open("lastid.py", 'r')
IOError: [Errno 2] No such file or directory: u'lastid.py'
I am absolutely 100% positive lastid.py and mainscript.py are on the server and inside the same directory - I have triple-checked this by running bash on heroku. My .gitignore file is blank so it isn't anything to do with that.
I don't understand why such a simple command as 'open a file in the same directory and read it' doesn't work on the server. What on earth have I done wrong?
(I realise I should have worked through some tutorials before trying to build something custom in a new language, but now I've started this I'd really love to finish it - any help anyone can offer would be very much appreciated.)
Probably the python interpreter is being executed from a different directory than where your script lives.
Here's the same setup:
oliver#aldebaran /tmp/junk $ cat test.txt
a
b
c
baseoliver#aldebaran /tmp/junk $ cat sto.py
with open('test.txt', 'r') as f:
for line in f:
print(line)
baseoliver#aldebaran /tmp/junk $ python sto.py
a
b
c
baseoliver#aldebaran /tmp/junk $ cd ..
baseoliver#aldebaran /tmp $ python ./junk/sto.py
Traceback (most recent call last):
File "./junk/sto.py", line 1, in <module>
with open('test.txt', 'r') as f:
IOError: [Errno 2] No such file or directory: 'test.txt'
To solve this, import os and use absolute pathnames:
import os

# Resolve the directory that contains this script, so the data file is
# found no matter which working directory the interpreter starts in.
# abspath() makes the result absolute even when __file__ is relative.
MYDIR = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(MYDIR, 'test.txt')) as f:
    pass
    # and so on
I'm trying to create a python script to check if a host is alive, if so, download the website into a results/ directory. Once I learn how to do this I will branch out on figuring out how to spider and launch other subprocesses (such as launching nikto/skipfish after checking is complete and loading the saved file).
#! /usr/bin/python
import os
import sys
import urllib
import urllib2
import subprocess
# Where the magic happens
str1 = raw_input("Enter your target: ")
print "Target = ", str1
print "commencing testing on", str1
# Let's set the user-agent headers
http_headers = {"User-Agent":"Mozilla/5.0"}
request = urllib2.Request(str1)
response = urllib2.urlopen(request)
payload = response.read()
dir_path = os.path.join(self.results)
os.makedirs(dir_path)
**with open(os.join.path(dir_path, 'index.html', 'wb') as file:
file.write(payload)
print str1, "index written to file"**
# Send an email to notify us when complete
var = "world"
pipe = subprocess.Popen(["./email.sh", var], stdout=subprocess.PIPE)
result = pipe.stdout.read()
print result
I receive the following error message:
File "./webtest.py", line 43
with open(os.join.path(dir_path, 'index.html', 'wb') as file:
^
SyntaxError: invalid syntax
Error after closing the parenthesis (from Phil's Answer):
Traceback (most recent call last):
File "./webtest.py", line 41, in <module>
dir_path = os.path.join(self.results)
NameError: name 'self' is not defined
You missed a closing parenthesis:
# os.path.join (not os.join.path) builds the path; 'wb' is the mode
# argument of open(), so it goes outside the join() call.
with open(os.path.join(dir_path, 'index.html'), 'wb') as file:
    file.write(payload)
EDIT
That line has to do with the directory that you want. It's giving errors because you're not in a class (so "self" doesn't exist). The best course of action would be to replace it with just "results" and specify where results are. For example:
results = "/resultsdir/"
dir_path = os.path.join(results)