What I want to do is read every URL from a file and scrape each of those URLs. After that, I want to move the scraped data into the WebRealTor class, then serialize it to JSON and finally save all of it in a JSON file.
This is the content of the file:
https://www.seloger.com/annonces/achat/appartement/paris-14eme-75/montsouris-dareau/143580615.htm?ci=750114&idtt=2,5&idtypebien=2,1&LISTING-LISTpg=8&naturebien=1,2,4&tri=initial&bd=ListToDetail
https://www.seloger.com/annonces/achat/appartement/montpellier-34/gambetta/137987697.htm?ci=340172&idtt=2,5&idtypebien=1,2&naturebien=1,2,4&tri=initial&bd=ListToDetail
https://www.seloger.com/annonces/achat/appartement/montpellier-34/celleneuve/142626025.htm?ci=340172&idtt=2,5&idtypebien=1,2&naturebien=1,2,4&tri=initial&bd=ListToDetail
https://www.seloger.com/annonces/achat/appartement/versailles-78/domaine-national-du-chateau/138291887.htm
And my script is:
import scrapy
import json


class selogerSpider(scrapy.Spider):
    name = "realtor"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        }
    }

    def start_requests(self):
        with open("annonces.txt", "r") as file:
            for line in file.readlines():
                yield scrapy.Request(line)

    def parse(self, response):
        name = response.css(".agence-link::text").extract_first()
        address = response.css(".agence-adresse::text").extract_first()

        XPATH_siren = ".//div[@class='legalNoticeAgency']//p/text()"
        siren = response.xpath(XPATH_siren).extract_first()

        XPATH_website = ".//div[@class='agence-links']//a/@href"
        site = response.xpath(XPATH_website).extract()

        XPATH_phone = ".//div[@class='contact g-row-50']//div[@class='g-col g-50 u-pad-0']//button[@class='btn-phone b-btn b-second fi fi-phone tagClick']/@data-phone"
        phone = response.xpath(XPATH_phone).extract_first()

        yield {
            'Agency_Name =': name,
            'Agency_Address =': address,
            'Agency_profile_website =': site,
            'Agency_number =': phone,
            'Agency_siren =': siren
        }
        file.close()


class WebRealTor:
    def __name__(self):
        self.nom = selogerSpider.name

    def __address__(self):
        self.adress = selogerSpider.address

    def __sirenn__(self):
        self.sire = selogerSpider.siren

    def __numero__(self):
        self.numero = selogerSpider.phone


with open('data.txt', 'w') as outfile:
    json.dump(data, outfile)
Try to move everything to start_requests in your class, like this:

def start_requests(self):
    with open("annonces.txt", "r") as file:
        for line in file.readlines():
            yield scrapy.Request(line)  # self.parse is the default callback

def parse(self, response):
    # each link's parsing, as you already did
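For reference, here is a minimal sketch of the whole spider with that change applied, letting Scrapy's built-in feed export do the JSON serialization instead of a separate WebRealTor class and json.dump. The selectors are taken from the question, except for the shortened phone XPath, which is an assumption:

import scrapy


class SelogerSpider(scrapy.Spider):
    name = "realtor"

    def start_requests(self):
        # one listing URL per line; strip the trailing newline before requesting
        with open("annonces.txt") as file:
            for line in file:
                url = line.strip()
                if url:
                    yield scrapy.Request(url)  # callback defaults to self.parse

    def parse(self, response):
        # get()/getall() are the current spellings of extract_first()/extract()
        yield {
            "Agency_Name": response.css(".agence-link::text").get(),
            "Agency_Address": response.css(".agence-adresse::text").get(),
            "Agency_siren": response.xpath("//div[@class='legalNoticeAgency']//p/text()").get(),
            "Agency_profile_website": response.xpath("//div[@class='agence-links']//a/@href").getall(),
            # assumption: matching the phone button by a single class is enough
            "Agency_number": response.xpath("//button[contains(@class, 'btn-phone')]/@data-phone").get(),
        }

Running it with scrapy crawl realtor -o data.json writes every yielded dict into data.json, so no manual serialization step is needed.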
I'm trying to save the image URLs for individual properties in their own CSV files via feed exports. For this to work, the FEEDS csv path in custom_settings has to change every time a scrapy.Request is yielded in start_requests. Each time a request is yielded, self.feeds_csv_path (initialised in __init__) is assigned a new CSV file path corresponding to the property id, produced by get_feeds_csv_path, as in the code below. However, the self.feeds_csv_path reference in custom_settings doesn't seem to pick up that value. Where is the error here?
import asyncio
from configparser import ConfigParser
import os
import pandas as pd
import scrapy
import requests
import json


class GetpropertyimgurlsSpider(scrapy.Spider):
    name = 'GetPropertyImgUrls'
    custom_settings = {
        "FEEDS": {
            self.feeds_csv_path: {
                "format": "csv",
                "overwrite": True
            }
        }
    }

    def __init__(self, *args, **kwargs):
        self.feeds_csv_path = None
        super(GetpropertyimgurlsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        files = self.get_html_files()  # List of html file full paths
        for file in files[:2]:
            self.feeds_csv_path = self.get_feeds_csv_path(file)
            yield scrapy.Request(file, callback=self.parse)

    def parse(self, response):
        texts = response.xpath("//text()").getall()
        text = texts[1]
        json_text = json.loads(text)
        #print(text)
        photos = json_text["@graph"][3]["photo"]
        for photo in photos:
            yield photo["contentUrl"]

    def get_feeds_csv_path(self, html_file_path):
        property_id = html_file_path.split("/")[-2].split("_")[1]
        feeds_csv_path = f"{html_file_path}/images/Property_{property_id}_ImgSrcs.csv"
        return feeds_csv_path

    def get_path(self):
        config = ConfigParser()
        config.read("config.ini")  # Location relative to main.py
        path = config["scrapezoopla"]["path"]
        return path

    #Returns a list of html file dirs
    def get_html_files(self):
        path = self.get_path()
        dir = f"{path}/data/properties/"
        dir_list = os.listdir(dir)
        folders = []
        for ins in dir_list:
            if os.path.isdir(f"{dir}{ins}") == True:
                append_ins = folders.append(ins)
        html_files = []
        for folder in folders:
            html_file = f"{dir}{folder}/{folder}.html"
            if os.path.isfile(html_file) == True:
                append_html_file = html_files.append(f"file:///{html_file}")
        return html_files
The first problem I see is that you are using the self keyword in the namespace scope of your spider class. The self keyword is only available inside instance methods, where it is passed in as the first argument, e.g. def __init__(self, ...).
Even if self were available there, it still wouldn't work, because once the custom_settings dictionary is created, self.feeds_csv_path is immediately converted to its string value, so updating the instance variable later would have no effect on the custom_settings property.
Another issue is that Scrapy collects all of the custom settings and stores them internally before the crawl actually starts, so updating the custom_settings dictionary mid-crawl might not have any effect. I am not certain about that, though.
All of that being said, your goal is still achievable. One approach I can think of is to build the FEEDS dictionary at runtime, but prior to initiating the crawl, and to use custom scrapy.Item classes to filter which item belongs to which output.
I have no way of testing it, so it might be buggy, but here is an example of what I am referring to:
from configparser import ConfigParser
import json
import os

import scrapy


def get_path():
    config = ConfigParser()
    config.read("config.ini")  # Location relative to main.py
    path = config["scrapezoopla"]["path"]
    return path


#Returns a list of html file dirs
def get_html_files():
    path = get_path()
    folder = f"{path}/data/properties/"
    dir_list = os.listdir(folder)
    html_files = []
    for ins in dir_list:
        if os.path.isdir(f"{folder}{ins}"):
            if os.path.isfile(f"{folder}{ins}/{ins}.html"):
                html_files.append(f"file:///{folder}{ins}/{ins}.html")
    return html_files


def get_feeds_csv_path(html_file_path):
    property_id = html_file_path.split("/")[-2].split("_")[1]
    feeds_csv_path = f"{html_file_path}/images/Property_{property_id}_ImgSrcs.csv"
    return feeds_csv_path


def create_custom_item():
    class Item(scrapy.Item):
        contentUrl = scrapy.Field()
    return Item


def customize_settings():
    feeds = {}
    files = get_html_files()
    start_urls = {}
    for path in files:
        custom_class = create_custom_item()
        output_path = get_feeds_csv_path(path)
        start_urls[path] = custom_class
        feeds[output_path] = {
            "format": "csv",
            "item_classes": [custom_class],
        }
    custom_settings = {"FEEDS": feeds}
    return custom_settings, start_urls


class GetpropertyimgurlsSpider(scrapy.Spider):
    name = 'GetPropertyImgUrls'
    custom_settings, start_urls = customize_settings()

    def start_requests(self):
        for uri, itemclass in self.start_urls.items():
            yield scrapy.Request(uri, callback=self.parse, cb_kwargs={'itemclass': itemclass})

    def parse(self, response, itemclass):
        texts = response.xpath("//text()").getall()
        text = texts[1]
        json_text = json.loads(text)
        photos = json_text["@graph"][3]["photo"]
        for photo in photos:
            item = itemclass()
            item['contentUrl'] = photo["contentUrl"]
            yield item
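The key design choice is that each input file gets its own Item subclass, and the matching FEEDS entry lists that class under item_classes (a feed option available in newer Scrapy releases), so the exporter routes every item to the CSV belonging to the file it was scraped from. Because customize_settings() runs when the class body is evaluated, custom_settings is complete before the crawl starts, which avoids trying to mutate it from start_requests; you then run the spider as usual with scrapy crawl GetPropertyImgUrls.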
I am creating a Python program using Scrapy that crawls a given domain and, when it finds PDFs, scans them for information (location of the PDF, number of pages, image count, field count, tagged or not, etc.) and places all of this into a CSV file.
It downloads all the PDFs just fine, but when I open the CSV file, only a fraction of the downloaded files are listed in it. I'm not sure what I am doing wrong. I thought perhaps I was not properly closing the file once opened, but I'm not sure that's the problem. Code is below:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os.path
import validators
import csv
from .. info import isTagged
from .. get_metadata import get_data, count_images
from .. fieldCount import getFieldCount


class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider'

    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]

    #need domain to name folder pdfs will be put into
    global domain
    domain = allowed_domains[0]

    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]

    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        base_url = start
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?', 'field count', 'image count']
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'x')
        writer = csv.writer(f)
        writer.writerow(header)
        f.close()

    def save_pdf(self, response):
        url = response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            isExist = os.path.exists(save_dir)
            if not isExist:
                # Create a new directory because it does not exist
                os.makedirs(save_dir)
            csvFile = domain + '.csv'
            csvPath = save_dir + '/' + csvFile
            csvPathExist = os.path.exists(csvPath)
            if not csvPathExist:
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            with open(full_path, 'wb') as f:
                f.write(response.body)
            is_tagged = isTagged(full_path)
            metaData = get_data(full_path)
            fieldCount = getFieldCount(full_path)
            imageCount = count_images(full_path)
            row = [url, metaData[0], metaData[1], metaData[2], is_tagged, fieldCount, imageCount]
            self.add_to_csv(row)
            f.close()
        else:
            print(f"Failed to load pdf: {url}")

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'a', newline='')
        writer = csv.writer(f)
        writer.writerow(row)
        f.close()
So I think it's the add_to_csv function that's the problem, but I can't figure out why. Any help would be appreciated.
The issue is in how you call the self.add_to_csv(row) method inside the save_pdf() method: after calling it you close the file again, which leads to incomplete information being written to the CSV. What you can do is put your code in a try/except clause and close all the files in the finally block.
There is nothing wrong with the logic in the add_to_csv() method itself.
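A sketch of what that could look like if both methods rely on with blocks, which give the same guarantee as closing the files in a finally clause. It assumes the same module-level globals (base_path, domain) and PDF helpers (isTagged, get_data, getFieldCount, count_images) as in the question; the rest of the spider stays unchanged:

import csv
import os

from scrapy.spiders import CrawlSpider


class PdfspiderSpider(CrawlSpider):
    # name, allowed_domains, rules, parse_item and create_csv as in the question

    def save_pdf(self, response):
        if response.status != 200:
            print(f"Failed to load pdf: {response.url}")
            return
        save_dir = os.path.join(base_path, domain)
        os.makedirs(save_dir, exist_ok=True)  # no-op if the folder already exists
        if not os.path.exists(os.path.join(save_dir, domain + '.csv')):
            self.create_csv()
        full_path = os.path.join(save_dir, response.url.split('/')[-1])
        with open(full_path, 'wb') as f:  # closed automatically, even on error
            f.write(response.body)
        # the PDF is written and closed before the helpers open it again
        metaData = get_data(full_path)
        row = [response.url, metaData[0], metaData[1], metaData[2],
               isTagged(full_path), getFieldCount(full_path), count_images(full_path)]
        self.add_to_csv(row)

    def add_to_csv(self, row):
        filename = os.path.join(base_path, domain, domain + '.csv')
        with open(filename, 'a', newline='') as f:  # closed when the block ends
            csv.writer(f).writerow(row)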
I am trying to write a class that takes a string variable in the constructor, validates that it is a valid file path and, if it is, loads the file and populates instance variables with its content; the class should also have a getter for each variable.
Here is the json file:
{
    "name": [
        "Caadi",
        "Iskadhig"
    ],
    "location": 20356,
    "job": "Engineer",
    "address": [
        {
            "city": "Swindon",
            "county": [
                "Avon"
            ]
        }
    ]
}
So far I have attempted the following code:
import json
import os.path


class Config:
    def __init__(self, file_name, name, location, job, address):
        self.file_name = file_name
        self.name = name
        self.location = location
        self.job = job
        self.address = address

        try:
            if os.path.exists('/CaseConfig.json'):  # validate the file path
                with open('/CaseConfig.json', 'r') as file:
                    json_file_data = file.read()  # read the content of file
                    self.file_name.get_json_content_file = json.loads(json_file_data)  # the content of file
            else:
                print("File doesn't exist please check the path")
        except Exception as e:
            print("File not accessible", e)

    def getName(self):
        return self.name

    def getLocation(self):
        return self.location

    def getJob(self):
        return self.job

    def getAddress(self):
        return self.address


obj = Config('file_name', 'name', 'location', 'job', 'address')
I am stuck and not sure why I am getting the following error:
File not accessible 'str' object has no attribute 'get_json_content_file'
Your JSON file has newlines; you must get rid of them.
Try the following:
json_file_data = file.read().replace("\n","")
If you read a file with file.read(), it will (at least in this case) be converted into a string. To properly read a JSON file you want to do something like this:

with open("your file name.json", "r") as file:
    data = json.load(file)

In your case data will be

{'name': ['Caadi', 'Iskadhig'], 'location': 20356, 'job': 'Engineer', 'address': [{'city': 'Swindon', 'county': ['Avon']}]}

You can then read the data out of this dictionary in the same way you would any other.
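Putting that together, here is a sketch of how the Config class could look once it takes only the file path (as described at the top) and parses it with json.load; the getter names follow the question, while the exact error handling is an assumption:

import json
import os.path


class Config:
    def __init__(self, file_name):
        self.file_name = file_name
        self.name = None
        self.location = None
        self.job = None
        self.address = None
        if not os.path.exists(file_name):  # validate the file path
            print("File doesn't exist, please check the path")
            return
        try:
            with open(file_name, 'r') as file:
                data = json.load(file)  # parse the JSON straight into a dict
        except (OSError, json.JSONDecodeError) as e:
            print("File not accessible", e)
            return
        self.name = data.get("name")
        self.location = data.get("location")
        self.job = data.get("job")
        self.address = data.get("address")

    def getName(self):
        return self.name

    def getLocation(self):
        return self.location

    def getJob(self):
        return self.job

    def getAddress(self):
        return self.address


obj = Config('/CaseConfig.json')
print(obj.getJob())  # prints 'Engineer' with the sample file above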
I wanted to scrape a few pdfs from a great history crash course I used to read a long time ago. Sadly, the old website is down and I only managed to get the old html code from archive.org
(the links I got work fine, ex: https://drive.google.com/file/d/0BzRJiIvdbSoKcHpGUWJBUDZ2WDA/edit?usp=sharing).
This script results in HTML files being downloaded that say:
"We're sorry but your computer or network may be sending automated queries. To protect our users, we can't process your request right now."
Is there a way to bypass this? I tried putting a few random delays into the code, so this might be insufficient, or I might be on Google's blacklist for now.
(the text.txt file can be found here https://filebin.net/k2qw09embamx05ey )
import requests
import time
import random


def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    time.sleep(random.randrange(1, 2))
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)


def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None


def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)


f = open('text.txt')
long_string = f.readlines()

interesting_strings = []
for item in long_string:
    if 'drive.google' in item:
        interesting_strings.append(item)

print(interesting_strings)

interesting_strings = interesting_strings[0]
interesting_strings = interesting_strings.split('https://web.archive.org/web/20161219093036/')

links = []
for item in interesting_strings:
    if 'drive.google' in item:
        idx = item.find('"')
        links.append(item[:idx])

cntr = 1
for link in links:
    print(link)
    fname = './data/History_' + str(cntr)
    file_id = link.split('/')[-2]
    print('id:', file_id)
    destination = fname
    download_file_from_google_drive(file_id, destination)
    print('Getting file #', str(cntr))
    cntr += 1
    time.sleep(random.randrange(3, 15) + random.random())
Use gdown:
import gdown
file_id = '0BzRJiIvdbSoKcHpGUWJBUDZ2WDA'
filename = 'file.pdf'
url = 'https://drive.google.com/uc?id=' + file_id
gdown.download(url, filename, quiet=False)
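If you want to keep the link extraction from the question, the same loop can feed gdown directly; a sketch assuming the links list built by the script above:

import gdown

# 'links' holds the Google Drive URLs extracted from text.txt, as in the question
for cntr, link in enumerate(links, start=1):
    file_id = link.split('/')[-2]  # e.g. 0BzRJiIvdbSoKcHpGUWJBUDZ2WDA
    url = 'https://drive.google.com/uc?id=' + file_id
    gdown.download(url, f'./data/History_{cntr}.pdf', quiet=False)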
Here is my code:
import json
from pyspider.libs.base_handler import *

f = open("demo.txt", "w")


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=0, seconds=0)
    def on_start(self):
        self.crawl('Any URL', callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        img = ""
        for each in response.doc("img[id='landingImage']").items():
            img = each
        f.write("\n" + response.doc('title').text() + "\t" + response.doc("span[id^='priceblock']").text())
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "img_url": img.attr['src'],
            "Price": response.doc("span[id^='priceblock']").text(),
            "Availability": response.doc("div[id='availability']").text()
        }
With the above code I can scrape data from the link, but I cannot get the output in JSON or CSV format.
You can download the data from the webui, or append the data to a file by overriding the on_result method.
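A minimal sketch of the second option: on_result receives whatever detail_page returns (it can also be called with an empty result), so you can append each result as one JSON line to demo.txt and still keep the default behaviour by calling the parent method; on_start and detail_page stay as in your code:

import json

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}

    # on_start and detail_page exactly as in the question

    def on_result(self, result):
        if not result:
            return  # pyspider also calls on_result with empty results
        with open("demo.txt", "a") as f:
            f.write(json.dumps(result) + "\n")
        super(Handler, self).on_result(result)  # keep the default webui / resultdb handling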