Reading URLs from a file in Python

I cannot read the URLs in a txt file.
I want to read and open the URL addresses in the txt file one by one, and extract the page title with a regex from the source of each URL.
Error messages:
Traceback (most recent call last):
  File "Mypy.py", line 14, in <module>
    UrlsOpen = urllib2.urlopen(listSplit)
  File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python2.7/urllib2.py", line 420, in open
    req.timeout = timeout
AttributeError: 'list' object has no attribute 'timeout'
Mypy.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests
import urllib2
import threading
UrlListFile = open("Url.txt","r")
UrlListRead = UrlListFile.read()
UrlListFile.close()
listSplit = UrlListRead.split('\r\n')
UrlsOpen = urllib2.urlopen(listSplit)
ReadSource = UrlsOpen.read().decode('utf-8')
regex = '<title.*?>(.+?)</title>'
comp = re.compile(regex)
links = re.findall(comp,ReadSource)
for i in links:
    SaveDataFiles = open("SaveDataMyFile.txt","w")
    SaveDataFiles.write(i)
    SaveDataFiles.close()

When you call urllib2.urlopen(listSplit), listSplit is a list, but urlopen needs a string or Request object. It's a simple fix: iterate over listSplit and pass each URL to urlopen instead of passing the entire list.
Also, re.findall() returns a list for each ReadSource searched. You can handle this in a couple of ways:
I chose to handle it by just making a list of lists:
websites = [[link, link], [link], [link, link, link]]
and iterating over both lists. This makes it possible to do something specific with each list of URLs from each website (put them in different files, etc.).
You could also flatten the websites list to just contain the links instead of inner lists that then contain the links (see the sketch after the code below):
links = [link, link, link, link]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib2
from pprint import pprint
UrlListFile = open("Url.txt", "r")
UrlListRead = UrlListFile.read()
UrlListFile.close()
listSplit = UrlListRead.splitlines()
pprint(listSplit)
regex = '<title.*?>(.+?)</title>'
comp = re.compile(regex)
websites = []
for url in listSplit:
    UrlsOpen = urllib2.urlopen(url)
    ReadSource = UrlsOpen.read().decode('utf-8')
    websites.append(re.findall(comp, ReadSource))

with open("SaveDataMyFile.txt", "w") as SaveDataFiles:
    for website in websites:
        for link in website:
            pprint(link)
            SaveDataFiles.write(link.encode('utf-8') + '\n')  # newline separates the titles; with closes the file for us
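If you prefer the flattened form mentioned earlier, a minimal sketch (assuming the websites list of lists built by the code above):

# flatten the nested lists into a single list of titles
links = [link for website in websites for link in website]
pprint(links)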

Related

How to get a file from a URL in Python?

I want to download text files using Python. How can I do so?
I used urlopen(url).read() (from urllib, not requests), but it gives me the bytes representation of the file.
For me, I had to do the following (Python 3):
from urllib.request import urlopen
data = urlopen("[your url goes here]").read().decode('utf-8')
# Do what you need to do with the data.
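If the file isn't UTF-8, hard-coding the codec can fail. A small sketch that asks the response headers for the charset first (the URL is a placeholder; headers.get_content_charset() is available on the response headers object in Python 3):

from urllib.request import urlopen

response = urlopen("https://example.com/file.txt")  # placeholder URL
charset = response.headers.get_content_charset() or 'utf-8'  # fall back to UTF-8 if the server doesn't say
data = response.read().decode(charset)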
You can use multiple options.
For the simpler solution, you can use urllib directly:

import urllib.request

file_url = 'https://someurl.com/text_file.txt'
for line in urllib.request.urlopen(file_url):
    print(line.decode('utf-8'))

For a requests-based solution:

import requests

file_url = 'https://someurl.com/text_file.txt'
response = requests.get(file_url)
if response.ok:  # True for any successful status code
    data = response.text
    for line in data.split('\n'):
        print(line)
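For large files, loading the whole body into memory may not be ideal. A sketch using requests' streaming mode (same placeholder URL as above):

import requests

file_url = 'https://someurl.com/text_file.txt'
with requests.get(file_url, stream=True) as response:
    response.raise_for_status()  # fail loudly on HTTP errors
    for line in response.iter_lines(decode_unicode=True):
        print(line)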
When downloading text files with Python, I like to use the wget module:
import wget
remote_url = 'https://www.google.com/test.txt'
local_file = 'local_copy.txt'
wget.download(remote_url, local_file)
If that doesn't work, try using urllib:
from urllib import request
remote_url = 'https://www.google.com/test.txt'
file = 'copy.txt'
request.urlretrieve(remote_url, file)
When you use the requests module you are reading the file directly from the internet, which is why you see the text in byte format. Try writing the text to a file, then view it manually by opening it on your desktop:

import requests

remote_url = 'https://test.com/test.txt'  # requests needs the scheme (https://)
local_file = 'local_file.txt'
data = requests.get(remote_url)
with open(local_file, 'wb') as file:
    file.write(data.content)

Search for a string in a webpage and print the entire line containing it using Python

I would like to search a webpage for a string and print the entire line containing that string.
I have an input file containing the links that I would like to search for that string.
String to be searched: "vcore"
My Input File:
http://abc/cluster/app/application_1447334090028_225490
http://abc/cluster/app/application_1447334090028_228858
Expected Output File:
http://abc/cluster/app/application_1447334090028_225490 12434 vcore, 123 mb
http://abc/cluster/app/application_1447334090028_228858 12132 vcore, 131 mb
Code so far:
import sys
import re
import urllib
Links = [Link.strip() for Link in open('/home/try/Input.txt', 'r').readlines()]
for link in Links:
    webPage = urllib.urlopen(link).read()
    print webPage
Then I use grep to search for the string and store it in another file. But I want it to be done by the code itself, with the matching line appearing next to the corresponding link. Can anyone help me with this?
lines = urllib.urlopen(link).readlines()
for line in lines:
    if "vcore" in line:
        print line
import re
import urllib

Links = [Link.strip() for Link in open('/home/try/Urls.txt', 'r').readlines()]
for link in Links:
    lines = urllib.urlopen(link).readlines()
    for line in lines:
        if "vcore" in line:
            print link, line
The only remaining issue is blank lines after every print statement: each line read from the page already ends with a newline, and print adds another. Stripping the line fixes it, as in the sketch below.
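A minimal adjustment, assuming the same Urls.txt input as above, stripping the trailing newline before printing:

import urllib

Links = [Link.strip() for Link in open('/home/try/Urls.txt', 'r').readlines()]
for link in Links:
    for line in urllib.urlopen(link).readlines():
        if "vcore" in line:
            print link, line.rstrip()  # rstrip() drops the newline that caused the blank lines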

Parsing an XML file with Python while extracting attributes and children

I'm trying to read an XML file in Python whose general format is as follows:
<item id="1149" num="1" type="topic">
<title>Afghanistan</title>
<additionalInfo>Afghanistan</additionalInfo>
</item>
(This snippet repeats many times.)
I'm trying to get the id value and the title value to be printed into a file.
I'm having trouble getting the XML file into Python. Currently, I'm doing this to fetch it:
import xml.etree.ElementTree as ET
from urllib2 import urlopen
url = 'http://api.npr.org/list?id=3002' #1007 is science
response = urlopen(url)
f = open('out.xml', 'w')
f.write(response)
However, whenever I run this code, I get the error:
Traceback (most recent call last):
  File "python", line 9, in <module>
TypeError: expected a character buffer object
which makes me think that I'm not using something that can handle XML.
Is there any way that I can save the XML file to a file, then extract the title of each section, as well as the id attribute associated with that title?
Thanks for the help.
You can read the content of the response with this code:
import urllib2
opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),urllib2.HTTPCookieProcessor())
response= opener.open("http://api.npr.org/list?id=3002").read()
opener.close()
and then write it to a file:
f = open('out.xml', 'w')
f.write(response)
f.close()
What you want is response.read(), not response. The response variable is a response instance, not the XML string; calling response.read() reads the XML from it.
You can then write it directly to a file like so:
url = 'http://api.npr.org/list?id=3002' #1007 is science
response = urlopen(url)
f = open('out.xml', 'w')
f.write(response.read())
Alternatively, you could parse it directly with ElementTree, like so:
url = 'http://api.npr.org/list?id=3002' #1007 is science
response = urlopen(url)
tree = ET.fromstring(response.read())
To extract all of the id/title pairs you could do the following as well:
url = 'http://api.npr.org/list?id=3002' #1007 is science
response = urlopen(url)
tree = ET.fromstring(response.read())
for item in tree.findall("item"):
    print item.get("id")
    print item.find("title").text
From there you can decide where to store or output the values.
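For instance, a small sketch that writes the pairs to a file instead of printing them (the output filename is hypothetical; otherwise it uses the same tree as above):

with open('titles.txt', 'w') as out:  # hypothetical output file
    for item in tree.findall("item"):
        # one "id<TAB>title" pair per line; encode for Python 2 file output
        out.write('%s\t%s\n' % (item.get("id"), item.find("title").text.encode('utf-8')))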

Python urllib-html parse

A question about parsing a website:
My code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import urllib2
import re
# Parse Web
from lxml import html
import requests
def parse():
    try:
        output = open('proba.xml', 'w')
        page = requests.get('http://www.rts.rs/page/tv/sr/broadcast/22/RTS+1.html')
        tree = html.fromstring(page.text)
        parse = tree.xpath('//div[@class="ProgramTime"]/text()|//div[@class="ProgramName"]/text()|//a[@class="recnik"]/text()')
        for line in parse:
            clean = line.strip()
            if clean:
                print clean
    except:
        pass

parse()
My question is how I can write this result to a file. When I try with this:
print >> output, line
I get only the first 6 lines in the file. With this code:
output.write(line)
the same thing happens. Can you help me with this issue? What I want is to output the parsed content.
I am having trouble replicating the problem. Here is what I did...
import sys
import os
import urllib2
import re
from lxml import html
import requests

def parse():
    output = open('proba.xml', 'w')
    page = requests.get('http://www.rts.rs/page/tv/sr/broadcast/22/RTS+1.html')
    tree = html.fromstring(page.text)
    p = tree.xpath('//div[@class="ProgramTime"]/text()|//div[@class="ProgramName"]/text()|//a[@class="recnik"]/text()')
    for line in p:
        clean = line.strip()
        if clean:
            output.write(line.encode('utf-8') + '\n')  # the \n adds a line break
    output.close()

parse()
I think you are getting a Unicode-related error when writing to the file, but because you put everything in a try block and let the error pass silently, you aren't getting any feedback!
Try typing import this in a terminal. You will get the Zen of Python. One aphorism is "Errors should never pass silently."
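If you do keep a try block, catch the specific exception and report it rather than passing silently. A minimal sketch of what the write step might look like, assuming the suspected Unicode problem:

try:
    output.write(line)  # in Python 2 this implicitly ASCII-encodes unicode text and can fail
except UnicodeEncodeError as e:
    print 'could not write line: %s' % e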
Try this instead:
with open('proba.xml', 'w') as f:
    f.writelines(line.strip() + '\n' for line in parse)
Put this in place of the for line in parse: clean = ... loop and remove the output = ... declaration above; there is no need for output.write any more. Sorry if I am not clearer, I am typing this on a mobile phone.

Extracting blog data in Python

We have to extract a specified number of blogs (n) by reading them from a text file containing a list of blogs.
Then I extract the blog data and append it to a file.
This is just one part of the main assignment, which is applying NLP to the data.
So far I've done this:
import urllib2
from bs4 import BeautifulSoup

def create_data(n):
    blogs = open("blog.txt", "r")  # opening the file containing the list of blogs
    f = file("data.txt", "wt")     # create a file data.txt
    with open("blog.txt") as blogs:
        head = [blogs.next() for x in xrange(n)]
        page = urllib2.urlopen(head['href'])
        soup = BeautifulSoup(page)
        link = soup.find('link', type='application/rss+xml')
        print link['href']
        rss = urllib2.urlopen(link['href']).read()
        souprss = BeautifulSoup(rss)
        description_tag = souprss.find('description')
        f = open("data.txt", "a")  # data file created for applying NLP
        f.write(description_tag)
This code doesn't work. It worked when I gave the link directly, like:
page = urllib2.urlopen("http://www.frugalrules.com")
I call this function from a different script, where the user gives the input n.
What am I doing wrong?
Traceback:
Traceback (most recent call last):
  File "C:/beautifulsoup4-4.3.2/main.py", line 4, in <module>
    create_data(2)  # calls create_data(n) function from create_data
  File "C:/beautifulsoup4-4.3.2\create_data.py", line 14, in create_data
    page = urllib2.urlopen(head)
  File "C:\Python27\lib\urllib2.py", line 127, in urlopen
    return _opener.open(url, data, timeout)
  File "C:\Python27\lib\urllib2.py", line 395, in open
    req.timeout = timeout
AttributeError: 'list' object has no attribute 'timeout'
head is a list:
head = [blogs.next() for x in xrange(n)]
A list is indexed by integer indices (or slices). You cannot use head['href'] when head is a list:
page = urllib2.urlopen(head['href'])
It's hard to say how to fix this without knowing what the contents of blog.txt look like. If each line of blog.txt contains a URL, then you could use:
with open("blog.txt") as blogs:
for url in list(blogs)[:n]:
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
...
with open('data.txt', 'a') as f:
f.write(...)
Note that file is a deprecated form of open (and was removed in Python 3). Instead of using f = file("data.txt", "wt"), use the more modern with-statement syntax (as shown above).
For example,
import urllib2
import bs4 as bs
def create_data(n):
    with open("data.txt", "wt") as f:
        pass
    with open("blog.txt") as blogs:
        for url in list(blogs)[:n]:
            page = urllib2.urlopen(url)
            soup = bs.BeautifulSoup(page.read())
            link = soup.find('link', type='application/rss+xml')
            print(link['href'])
            rss = urllib2.urlopen(link['href']).read()
            souprss = bs.BeautifulSoup(rss)
            description_tag = souprss.find('description')
            with open('data.txt', 'a') as f:
                f.write('{}\n'.format(description_tag))

create_data(2)
I'm assuming that you are opening, writing to, and closing data.txt with each pass through the loop because you want to save partial results, maybe in case the program is forced to terminate prematurely.
Otherwise, it would be easier to just open each file once at the very beginning:
with open("blog.txt") as blogs, open("data.txt", "wt") as f:
