scrapy reading urls from a txt file fail - python

This is what the txt file looks like; I opened it from a Jupyter notebook. Notice that I changed the names of the links in the result for obvious reasons.
input-----------------------------
# Read every line of the URL list file; readlines() keeps trailing newlines.
with open('...\j.txt', 'r')as f:
data = f.readlines()
# First line of the file -- judging by the output below, the whole file is a
# single line containing a Python-list-style literal of URLs (TODO confirm).
print(data[0])
# data is a list of strings (one per line).
print(type(data))
output---------------------------------
['https://www.example.com/191186976.html', 'https://www.example.com/191187171.html']
Now I wrote this in my scrapy script, but it didn't follow the links when I ran it. Instead it shows: ERROR: Error while obtaining start requests.
class abc(scrapy.Spider):
name = "abc_article"
# NOTE(review): readlines() keeps the trailing newline on each line, and
# url_c[0] is a single *string*, not a list of URLs -- scrapy then iterates
# it character by character, which matches the "Error while obtaining start
# requests" reported above. Presumably j.txt stores the whole URL list as
# one line of text -- verify against the file shown earlier.
with open('j.txt' ,'r')as f4:
url_c = f4.readlines()
u = url_c[0]
start_urls = u
And if I write u = ['example.html', 'example.html'] and start_urls = u, then it works perfectly fine. I'm new to scrapy, so I'd like to ask: what is the problem here? Is it the reading method or something else I didn't notice? Thanks.

Something like this should get you going in the right direction.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Collect every row of the CSV; each row is a list whose first field is a URL.
contents = []
with open('C:\\your_path_here\\test.csv', 'r') as csvf:
    for row in csv.reader(csvf):
        contents.append(row)

# Fetch each listed page in turn and print its parsed markup.
for row in contents:
    raw_html = urlopen(row[0]).read()
    soup = BeautifulSoup(raw_html, "html.parser")
    print(soup)

Related

Created web scraping program in Python, need to save weblinks to csv and remove duplicates

I have been able to successfully scrape the website, but I am having some trouble saving the results as a CSV and need help to see where I have messed up. Here is my code; I have also included a snippet of the error output below:
# BUG: this binds the *module* bs4 to the name BeautifulSoup; the class is
# bs4.BeautifulSoup (see the corrected answer below).
import bs4 as BeautifulSoup
# BUG (per the traceback below): the module name is lowercase "csv", so
# "import CSV" raises ModuleNotFoundError.
import CSV
import re
import urllib.request
from IPython.display import HTML
# Program that scraps the website for
# NOTE(review): the URL string is broken across two lines here -- likely a
# paste artifact; as written this is a syntax error.
r= urllib.request.urlopen('https://www.census.gov/programs-
surveys/popest.html').read()
soup = BeautifulSoup(r,"html.parser")
for link in soup.find_all('a'):
print(link.get('href'))
with open("Giles_C996.csv","w") as csv_file:
# BUG: "/n" is not a newline ("\n"), and csv delimiters must be one character.
writer = csv.writer(csv_file,delimiter="/n")
# BUG: Links is never defined anywhere above.
writer.writerow(Links)
# BUG: Close() is not defined; the `with` block already closes the file.
Close()
Error message:
Traceback (most recent call last):
File "C:\Users\epiph\Giles_C996 Project 2.txt", line 2, in
import CSV
ModuleNotFoundError: No module named 'CSV'
My Code
You've incorrectly imported csv and bs4 modules. Also Close() is incorrect. And you can use conversion to set to get rid of duplicates.
import csv
import urllib.request
from bs4 import BeautifulSoup

# Download and parse the census page.
page_bytes = urllib.request.urlopen('https://www.census.gov/programs-surveys/popest.html').read()
parsed = BeautifulSoup(page_bytes, "html.parser")

# A set keeps each href exactly once, discarding the duplicates.
links = {anchor['href'] for anchor in parsed.find_all('a', href=True)}

# Write one link per CSV row; newline='' avoids blank rows on Windows.
with open("Giles_C996.csv", "w", newline='') as f:
    csv.writer(f).writerows([link] for link in links)
Output is:
https://www.census.gov/programs-surveys/cps.html
/newsroom/press-releases/2020/65-older-population-grows/65-older-population-grows-spanish.html
https://www.census.gov/businessandeconomy
https://www.census.gov/data
/programs-surveys/popest/library.html
etc.
You had some erroneous imports and called an undefined variable.
I'm not very familiar with IPython, so I can't comment much on your use of it. I always have trouble with urllib, so I just used requests.
I included some scrap code for an alternative layout for the csv file, as well as a function which can help determine if a link is valid, and a list comprehension in case you prefer that approach.
Also opens your csv file for you.
import csv, re, urllib.request, os
import requests
from bs4 import BeautifulSoup
# from IPython.display import HTML
def exists(link) -> bool:
    """Return True iff a GET request on *link* answers with HTTP 200."""
    try:
        response = requests.get(link)
    except (requests.exceptions.MissingSchema,
            requests.exceptions.InvalidSchema):
        # Not a fetchable URL (fragment, mailto:, etc.) -- treat as dead.
        return False
    return response.status_code == 200
def scrapeLinks(url):
    """Yield each live, not-yet-yielded href found on *url*, in page order."""
    seen = set()
    markup = requests.get(url).text
    for anchor in BeautifulSoup(markup, "html.parser").find_all('a', href=True):
        candidate = anchor['href']
        # Cheap duplicate check first, then the network liveness probe.
        if candidate not in seen and exists(candidate):
            yield candidate
            seen.add(candidate)
# Scrape the census page and save its unique live links to a CSV file.
url = 'https://www.census.gov/programs-surveys/popest.html'

# Fetched here as in the original flow; scrapeLinks() performs its own
# request internally as well.
page_text = requests.get(url).text
soup = BeautifulSoup(page_text, "html.parser")

file_name = "Giles_C996.csv"
with open(file_name, "w") as csv_file:
    writer = csv.writer(csv_file)
    # All links end up on one row; use
    # writer.writerows(enumerate(scrapeLinks(url), 1)) for a 2d-indexed layout.
    writer.writerow(scrapeLinks(url))

# Pop the finished file open in the default application (Windows only).
os.startfile(file_name)

requests.get(url) in python behaving differently when used in loop

I'm new to Python programming and am trying to scrape every link in my Urls.txt file.
the code I wrote is :
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
user_agent = UserAgent()
fp = open("Urls.txt", "r")
# NOTE(review): readlines() keeps the trailing "\n" on every line, so each
# `link` handed to requests.get() ends in a newline -- strip it first
# (this is exactly what the answer below points out).
values = fp.readlines()
fin = open("soup.html", "a")
for link in values:
print( link )
page = requests.get(link, headers={"user-agent": user_agent.chrome})
html = page.content
soup = BeautifulSoup(html, "html.parser")
fin.write(str(soup))
The code works absolutely fine when the links are provided directly as string instead of as variable but when used as it is the output differs.
Maybe the string you read from the file has a line break.
To remove it use link.strip("\n")

Parse URL beautifulsoup

import requests
import csv
from bs4 import BeautifulSoup
page = requests.get("https://www.google.com/search?q=cars")
soup = BeautifulSoup(page.content, "lxml")
import re
links = soup.findAll("a")
# 'wb' mode for the csv module is the Python 2 convention (the question
# states Python 2.7 below).
with open('aaa.csv', 'wb') as myfile:
# Google result hrefs look like "/url?q=<real url>&sa=..."; the lookbehind
# regex selects only anchors whose href carries such a redirect URL.
for link in soup.find_all("a",href=re.compile("(?<=/url\?q=)(htt.*://.*)")):
a = (re.split(":(?=http)",link["href"].replace("/url?q=","")))
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
wr.writerow(a)
The output of this code is a CSV file with 28 saved URLs; however, the URLs are not correct. For example, this is a wrong URL:
http://www.imdb.com/title/tt0317219/&sa=U&ved=0ahUKEwjg5fahi7nVAhWdHsAKHSQaCekQFgg9MAk&usg=AFQjCNFu_Vg9v1oVhEtR-vKqCJsR2YGd2A
Instead it should be:-
http://www.imdb.com/title/tt0317219/
How can I remove the second part of each URL if it contains "&sa="?
Because then the second part of the URL starting from:-
"&sa=" should be removed, so that all URLs are saved like the second URL.
I am using python 2.7 and Ubuntu 16.04.
If the redundant part of the url always starts with &, you can apply split() to each url:
# Everything after the first '&' is query/tracking junk; keep only the part
# in front of it.
url = 'http://www.imdb.com/title/tt0317219/&sa=U&ved=0ahUKEwjg5fahi7nVAhWdHsAKHSQaCekQFgg9MAk&usg=AFQjCNFu_Vg9v1oVhEtR-vKqCJsR2YGd2A'
url, _, _ = url.partition('&')
print(url)
output:
http://www.imdb.com/title/tt0317219/
Not the best way, but you could do one more time split, adding one more line after a:
# Re-wrap the first element with everything after the first "&" removed;
# `a` here is the list produced by re.split(...) in the question's loop above.
a=[a[0].split("&")[0]]
print(a)
Result:
['https://de.wikipedia.org/wiki/Cars_(Film)']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:I2SHYtLktRcJ']
['https://de.wikipedia.org/wiki/Cars_(Film)%23Handlung']
['https://de.wikipedia.org/wiki/Cars_(Film)%23Synchronisation']
['https://de.wikipedia.org/wiki/Cars_(Film)%23Soundtrack']
['https://de.wikipedia.org/wiki/Cars_(Film)%23Kritik']
['https://www.mytoys.de/disney-cars/']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:9Ohx4TRS8KAJ']
['https://www.youtube.com/watch%3Fv%3DtNmo09Q3F8s']
['https://www.youtube.com/watch%3Fv%3DtNmo09Q3F8s']
['https://www.youtube.com/watch%3Fv%3DkLAnVd5y7M4']
['https://www.youtube.com/watch%3Fv%3DkLAnVd5y7M4']
['http://cars.disney.com/']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:1BoR6M9fXwcJ']
['http://cars.disney.com/']
['http://cars.disney.com/']
['https://www.whichcar.com.au/car-style/12-cartoon-cars']
['https://www.youtube.com/watch%3Fv%3D6JSMAbeUS-4']
['http://filme.disney.de/cars-3-evolution']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:fO7ypFFDGk0J']
['http://www.4players.de/4players.php/spielinfonews/Allgemein/36859/2169193/Project_CARS_2-Zehn_Ferraris_erweitern_den_virtuellen_Fuhrpark.html']
['http://www.4players.de/4players.php/spielinfonews/Allgemein/36859/2169193/Project_CARS_2-Zehn_Ferraris_erweitern_den_virtuellen_Fuhrpark.html']
['http://www.play3.de/2017/08/02/project-cars-2-6/']
['http://www.imdb.com/title/tt0317219/']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:-xdXy-yX2fMJ']
['http://www.carmagazine.co.uk/']
['http://webcache.googleusercontent.com/search%3Fq%3Dcache:PRPbHf_kD9AJ']
['http://google.com/search%3Ftbm%3Disch%26q%3DCars']
['http://www.imdb.com/title/tt0317219/']
['https://de.wikipedia.org/wiki/Cars_(Film)']

How can i call URL's from text file one by one

I want to parse one website with some URLs, and I created a text file that has all the links I want to parse. How can I call these URLs from the text file one by one in a Python program?
from bs4 import BeautifulSoup
import requests
# Parse the listing page once.
soup = BeautifulSoup(requests.get("https://www.example.com").content, "html.parser")
for d in soup.select("div[data-selenium=itemDetail]"):
url = d.select_one("h3[data-selenium] a")["href"]
# Fetch each item's detail page and look for its UPC element.
upc = BeautifulSoup(requests.get(url).content, "html.parser").select_one("span.upcNum")
if upc:
# NOTE(review): `json` is used here but never imported in this snippet.
data = json.loads(d["data-itemdata"])
text = (upc.text.strip())
print(upc.text)
# Appends one "data,text" line per matched item (mode "a" -- presumably the
# file is meant to accumulate across runs; verify).
outFile = open('/Users/Burak/Documents/new_urllist.txt', 'a')
outFile.write(str(data))
outFile.write(",")
outFile.write(str(text))
outFile.write("\n")
outFile.close()
urllist.txt
https://www.example.com/category/1
category/2
category/3
category/4
Thanks in advance
Use a context manager :
# Load the whole file, then strip the surrounding newlines off every line.
with open("/file/path") as handle:
    raw_lines = handle.readlines()
urls = [line.strip('\n') for line in raw_lines]
You obtain your list with all urls in your file and can then call them as you like.

Open links from txt file in python

I would like to ask for help with an RSS program. What I'm doing is collecting sites which contain relevant information for my project and then checking if they have RSS feeds.
The links are stored in a txt file(one link on each line).
So I have a txt file with full of base urls what are needed to be checked for rss.
I have found this code which would make my job much easier.
import requests
from bs4 import BeautifulSoup
def get_rss_feed(website_url):
    """Print every RSS feed link advertised in *website_url*'s markup."""
    if website_url is None:
        print("URL should not be null")
        return
    body = requests.get(website_url).text
    soup = BeautifulSoup(body)
    # RSS feeds are advertised via <link type="application/rss+xml" href=...>.
    for link in soup.find_all("link", {"type" : "application/rss+xml"}):
        print("RSS feed for " + website_url + "is -->" + str(link.get('href')))

get_rss_feed("http://www.extremetech.com/")
But I would like to open my collected urls from the txt file, rather than typing each, one by one.
So I have tried to extend the program with this:
from bs4 import BeautifulSoup, SoupStrainer
with open('test.txt','r') as f:
# NOTE(review): this parses the txt file itself as HTML. A plain list of
# URLs (one per line, as described above) contains no <a> tags, and 'http'
# is being tested as an attribute *name*, not a value -- so nothing matches.
for link in BeautifulSoup(f.read(), parse_only=SoupStrainer('a')):
if link.has_attr('http'):
print(link['http'])
But this returns an error, saying that BeautifulSoup is not an HTTP client.
I have also extended with this:
# NOTE(review): the ":" after the def is missing, and naming the function
# "open" shadows the builtin open() it calls (the answer below warns about
# exactly this).
def open()
f = open("file.txt")
lines = f.readlines()
return lines
But this gave me a list separated with ","
I would be really thankful if someone could help me.
Typically you'd do something like this:
# Hand every line of the link file (newline still attached) to the checker.
with open('links.txt', 'r') as source:
    for url_line in source:
        get_rss_feed(url_line)
Also, it's a bad idea to define a function with the name open unless you intend to replace the builtin function open.
I guess you can do it using urllib:
import urllib

# Python 2 style: urllib.urlopen was removed in Python 3.
# One URL per line; readline() returns '' only at end of file, so the
# sentinel form of iter() stops exactly where the original while/break did.
fh = open('test.txt', 'r')
for line_url in iter(fh.readline, ''):
    fetched = urllib.urlopen(line_url).read()

Categories