Need to optimize scraping code - select URL with parameters - python

This is simple code for building a Google search URL with search parameters. It actually works, but I think it needs to be optimized.
def target_url(search_term, include_term, intext_term, target_site_in, page):
    base_template_0 = f'https://www.google.com/search?q={search_term}+"{include_term}"+intext:{intext_term}+site:{target_site_in}&hl=en&rlz='
    base_template_1 = f'https://www.google.com/search?q={search_term}+"{include_term}"+intext:{intext_term}&hl=en&rlz='
    base_template_2 = f'https://www.google.com/search?q={search_term}+"{include_term}"&hl=en&rlz='
    base_template_3 = f'https://www.google.com/search?q={search_term}&hl=en&rlz='
    search_term = search_term.replace(' ', '+')
    base_url_0 = base_template_0.format(search_term)
    base_url_1 = base_template_1.format(search_term)
    base_url_2 = base_template_2.format(search_term)
    base_url_3 = base_template_3.format(search_term)
    url_template_0 = base_url_0 + '&start={}'
    url_template_1 = base_url_1 + '&start={}'
    url_template_2 = base_url_2 + '&start={}'
    url_template_3 = base_url_3 + '&start={}'
    if page == 0 and search_term and include_term and intext_term and target_site_in:
        return base_url_0
    if page == 0 and search_term and include_term and intext_term:
        return base_url_1
    if page == 0 and search_term and include_term:
        return base_url_2
    if page == 0 and search_term:
        return base_url_3
    else:
        if search_term and include_term and intext_term and target_site_in:
            return url_template_0.format(page)
        if search_term and include_term and intext_term:
            return url_template_1.format(page)
        if search_term and include_term:
            return url_template_2.format(page)
        if search_term:
            return url_template_3.format(page)
Four parameters drive the query: search_term, include_term, intext_term, and target_site_in. Depending on which of them are provided, a different URL has to be returned, hence the separate templates and conditions.
Give me a better idea for optimizing this.

Instead of having multiple template strings and selecting between them, you can make a method that gives you the final search query:
def get_search_query(search_term, include_term, intext_term, target_site_in):
    response = search_term.replace(' ', '+')
    if include_term:
        response = f"{response}+{include_term}"
    if intext_term:
        response = f"{response}+intext:{intext_term}"
    if target_site_in:
        response = f"{response}+site:{target_site_in}"
    return response
Now you can call it in your method:
def target_url(search_term, include_term, intext_term, target_site_in, page):
    query = get_search_query(search_term, include_term, intext_term, target_site_in)
    url = f'https://www.google.com/search?q={query}&hl=en&rlz='
    if page != 0:
        url = f"{url}&start={page}"
    return url
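For example, with made-up argument values just to show the shape of the generated URLs:

# hypothetical values, purely for illustration
print(target_url('python scraping', 'tutorial', 'requests', 'stackoverflow.com', 0))
# https://www.google.com/search?q=python+scraping+tutorial+intext:requests+site:stackoverflow.com&hl=en&rlz=
print(target_url('python scraping', '', '', '', 10))
# https://www.google.com/search?q=python+scraping&hl=en&rlz=&start=10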

Related

Unable to iterate a list

I am trying to web-scrape information using Selenium. The code works for a single item, but when I pass the list I get the output below.
(Actual output and expected output were attached as screenshots and are not reproduced here.)
term=["Atta","Sugar"]
def get_link(term,page):
for term in term:
pin(Pincode)
grocery="https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=GROCERY&as-show=on&as=off"
term = term.replace(' ', '+')
stem = grocery.format(term)
url_template = stem + '&as-pos=1&as-type=HISTORY&as-backfill=on&page='
next=url_template+str(page)
#print(next)
return next
def PID():
for page in range(1,5):
path=get_link(term,page)
driver.get(path)
id=driver.find_elements_by_xpath('//div[#data-id]')
for i in id:
results=i.get_attribute('data-id')
#print(results)
PIDs.append(results)
Search_Term.append(term)
PID()
ID={'Query':Search_Term,'PID_s':PIDs}
Output=pd.DataFrame(ID)
print(Output)
Maybe it would be better to put the for loop over the terms inside the PID function. Try it like this:
terms = ["Atta", "Sugar"]
def get_link(term, page):
# Not sure what pin(Pincode) line is doing
grocery = "https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=GROCERY&as-show=on&as=off"
term = term.replace(' ', '+')
#print(term)
stem = grocery.format(term)
url_template = stem + '&as-pos=1&as-type=HISTORY&as-backfill=on&page='
next = url_template + str(page)
# print(next)
return next
def PID():
for term in terms:
for page in range(1, 5):
path = get_link(term, page)
driver.get(path)
id = driver.find_elements_by_xpath('//div[#data-id]')
for i in id:
results = i.get_attribute('data-id')
print(f"{term}:{results}")
# PIDs.append(results)
# Search_Term.append(term)
PID()
Atta:FLRFDPRFNGYJ95KD
Atta:FLRETEFHENWKNJQE
...
Sugar:SUGG4SFGSP6TCQ48
Sugar:SUGEUD25B6YCCNGM
...
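If you still need the DataFrame the original code was building, you can collect the pairs inside the same loops. A minimal sketch under the same assumptions as above (driver is already created and the data-id XPath still matches); collect_pids and rows are illustrative names:

import pandas as pd

def collect_pids():
    rows = []
    for term in terms:
        for page in range(1, 5):
            driver.get(get_link(term, page))
            for el in driver.find_elements_by_xpath('//div[@data-id]'):
                # one row per listing, keeping the search term next to its PID
                rows.append({'Query': term, 'PID_s': el.get_attribute('data-id')})
    return pd.DataFrame(rows)

print(collect_pids())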

How to retrieve section IDs using the Google Docs API in Python

For instance, we have a document such as this -
Table Of Content
Introduction
<text: A>
1.1 Background
<text: B>
1.2 Problem statement
<text: C>
Approach
<text: D>
2.1.1 Outline of the algorithm
<text: E>
I need to pattern-match a "string" in all of the texts in the document. For example, my search string could be "REQ-", which could match "REQ-1", "REQ-2", up to "REQ-10".
Suppose if "REQ-1" was located in text:C, and "REQ-2" in text:E, then the output I am looking for is
("REQ-1", "1.2"), ("REQ-2", "2.1.1") etc
Essentially, it should match the search string, identify all matches, and for each match return a 2-tuple of the matched string and the "section id" of the part of the document containing the matched string.
def get_creds():
    credentials = service_account.Credentials.from_service_account_file(
        "cred_new.json", scopes=SCOPES
    )
    return credentials

def search_paragraph_element(element, search_str):
    text_run = element.get('textRun')
    if not text_run:
        return False
    res = text_run.get('content').find(search_str)
    if res != -1:
        return True
    return False

def search_structural_elements(elements, search_str):
    text = ''
    hd_1 = 0
    hd_2 = 0
    hd_3 = 0
    for value in elements:
        if 'paragraph' in value:
            if value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_1':
                hd_1 = hd_1 + 1
                hd_2 = 0
                hd_3 = 0
            elif value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_2':
                hd_2 = hd_2 + 1
                hd_3 = 0
            elif value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_3':
                hd_3 = hd_3 + 1
            elements = value.get('paragraph').get('elements')
            for elem in elements:
                res = search_paragraph_element(elem, search_str)
                if res is True:
                    return str(hd_1) + '.' + str(hd_2) + '.' + str(hd_3)
    return text

def main():
    """Uses the Docs API to print out the text of a document."""
    credentials = get_creds()
    service = build("docs", "v1", credentials=credentials).documents()
    properties = service.get(documentId=REQ_DOCUMENT_ID).execute()
    doc_content = properties.get('body').get('content')
    print(search_structural_elements(doc_content, "MySearchString"))

if __name__ == '__main__':
    main()
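One possible direction, building on the code above: instead of returning on the first hit, keep the heading counters and record every occurrence, so the function yields the (matched string, section id) tuples described earlier. This is only a sketch; find_matches_with_sections is a made-up name and the REQ- regex in the usage comment is just an example:

import re

def find_matches_with_sections(elements, pattern):
    """Walk the document body and return (matched_text, section_id) tuples."""
    matches = []
    hd_1 = hd_2 = hd_3 = 0
    for value in elements:
        if 'paragraph' not in value:
            continue
        style = value['paragraph'].get('paragraphStyle', {}).get('namedStyleType', '')
        if style == 'HEADING_1':
            hd_1, hd_2, hd_3 = hd_1 + 1, 0, 0
        elif style == 'HEADING_2':
            hd_2, hd_3 = hd_2 + 1, 0
        elif style == 'HEADING_3':
            hd_3 += 1
        for elem in value['paragraph'].get('elements', []):
            text_run = elem.get('textRun')
            if not text_run:
                continue
            for m in re.findall(pattern, text_run.get('content', '')):
                # drop trailing zero levels so section ids read "1.2" rather than "1.2.0"
                parts = [hd_1, hd_2, hd_3]
                while len(parts) > 1 and parts[-1] == 0:
                    parts.pop()
                matches.append((m, '.'.join(map(str, parts))))
    return matches

# e.g. find_matches_with_sections(doc_content, r'REQ-\d+')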

How can I add to a list in Python without creating it first

I have the following Django view for finding the average score of a questionnaire.
@login_required
def statistics(request, slug=False):
    qn = get_object_or_404(Questionnaire, slug=slug)
    questions = Question.objects.filter(questionnaire=qn).count()
    qs = Question.objects.filter(questionnaire=qn)
    responses = Response.objects.filter(question__in=qs, user=request.user).count()
    if questions == 0 or responses == 0 or not questions <= responses:
        return render(request, "questionnaire/stats.html")
    out = {}
    for q in qs:
        response = Response.objects.filter(question=q, user=request.user).order_by("session_datetime").first()
        out[q.category] = {}
        time = response.session_datetime
        time_string = time.strftime("%d/%m/%Y")
        out[q.category][time_string] = []
        responses_in_time = Response.objects.filter(question=q, user=request.user, session_datetime__gte=time,
                                                    session_datetime__lt=time + datetime.timedelta(hours=24))
        for res in responses_in_time:
            out[q.category][time_string] += [res.value]
    print(out)
    for category in out.keys():
        print("outcat" + str(out[category]))
        for time in out[category].keys():
            out[category][time] = sum(out[category][time])/len(out[category][time])
    print(out)
    return render(request, "questionnaire/stats.html", context={"questionnaire": qn, "stats_json": json.dumps(out)})
I am wondering if there is a way to build the dictionary, keyed by a time taken from the model/record inside the loop, without resetting it each time. If I try += without creating the entry first it complains, but I do not know the initial time before entering the loop.
OK, I don't know why I can only think of an answer just after I've posted, but here is the working code. By looping through twice, once to create the dict and once to fill it, I have solved this.
@login_required
def statistics(request, slug=False):
    qn = get_object_or_404(Questionnaire, slug=slug)
    questions = Question.objects.filter(questionnaire=qn).count()
    qs = Question.objects.filter(questionnaire=qn)
    responses = Response.objects.filter(question__in=qs, user=request.user).count()
    if questions == 0 or responses == 0 or not questions <= responses:
        return render(request, "questionnaire/stats.html")
    out = {}
    response = Response.objects.filter(question=qs.first(), user=request.user).order_by("session_datetime").first()
    time = response.session_datetime
    time_string = time.strftime("%d/%m/%Y")
    for q in qs:
        out[q.category] = {}
        out[q.category][time_string] = []
    for q in qs:
        responses_in_time = Response.objects.filter(question=q, user=request.user, session_datetime__gte=time,
                                                    session_datetime__lt=time + datetime.timedelta(hours=24))
        for res in responses_in_time:
            out[q.category][time_string] += [res.value]
    print(out)
    for category in out.keys():
        print("outcat" + str(out[category]))
        for time in out[category].keys():
            out[category][time] = sum(out[category][time])/len(out[category][time])
    print(out)
    return render(request, "questionnaire/stats.html", context={"questionnaire": qn, "stats_json": json.dumps(out)})
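As a side note on the original question (adding to a list without creating it first): dict.setdefault, or collections.defaultdict, lets you append to a per-key list even when the key does not exist yet, which avoids the separate creation pass. A small self-contained sketch, with made-up rows standing in for the queryset values:

from collections import defaultdict

out = defaultdict(lambda: defaultdict(list))

# hypothetical (category, time_string, value) rows, just for illustration
rows = [("sleep", "01/01/2024", 3), ("sleep", "01/01/2024", 5), ("mood", "01/01/2024", 4)]

for category, time_string, value in rows:
    # no need to create out[category] or out[category][time_string] beforehand
    out[category][time_string].append(value)

averages = {c: {t: sum(v) / len(v) for t, v in times.items()} for c, times in out.items()}
print(averages)  # {'sleep': {'01/01/2024': 4.0}, 'mood': {'01/01/2024': 4.0}}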

Scraping view function remembers its previous iterations

I have the following view function used to scrape data:
def results(request):
    if request.method == 'POST':
        form = RoomForm(request.POST)
        if form.is_valid():
            form_city = form.cleaned_data['city'].title()
            form_country = form.cleaned_data['country'].title()
            form_arrival_date = form.cleaned_data['arrival_date']
            form_departure_date = form.cleaned_data['departure_date']
            form_pages_to_scrape = form.cleaned_data['pages_to_scrape']
            #launch scraper
            scraper = AIRBNB_scraper(city=form_city, country=form_country, arrival_date=str(form_arrival_date), departure_date=str(form_departure_date))
            scraped_dataframe = scraper.scrape_multiple_pages(last_page_selector_number=form_pages_to_scrape)
            scraped_dataframe_sorted = scraped_dataframe.sort_values('prices')
            print(scraped_dataframe_sorted)
            #convert scraped dataframe into lists
            prices = scraped_dataframe_sorted['prices'].tolist()
            listings_links = scraped_dataframe_sorted['listings_links'].tolist()
            listings_names = scraped_dataframe_sorted['listings_names'].tolist()
            photo_links = scraped_dataframe_sorted['photo_links'].tolist()
            dictionary = zip(prices, listings_links, listings_names, photo_links)
            context = {'dictionary': dictionary}
            return render(request, 'javascript/results.html', context)
On form submit, a post request is sent to this function using AJAX:
var frm = $('#login-form');
frm.submit(function () {
    $.ajax({
        type: "POST",
        url: "/results",
        data: frm.serialize(),
        success: function (data) {
            $("#table").html(data);
            $('#go_back').remove();
        },
        error: function (data) {
            $("#table").html("Something went wrong!");
        }
    });
    return false;
});
After that, the scraped data is displayed as an HTML table on the same page the form is on.
The problem is that the number of scraped items doubles every time the form is submitted. So, for example, if the number of scraped items on the first button click is sixteen, the output will be 16, but on the second run it will be 32, then 64, and so on.
It is like the app remembers previous form submits, but I don't see any reason why. I tried clearing - at the end of this function - the pandas dataframe used to store the scraped data and also the dictionary passed as context, but to no avail.
The form is:
class RoomForm(forms.Form):
    city = forms.CharField(max_length=100)
    country = forms.CharField(max_length=100)
    arrival_date = forms.DateField(widget=forms.DateInput(attrs=
        {
            'class': 'datepicker'
        }), required=False)
    departure_date = forms.DateField(widget=forms.DateInput(attrs=
        {
            'class': 'datepicker'
        }), required=False)
    pages_to_scrape = forms.IntegerField(label='Pages to scrape (max. 17)', min_value=0, max_value=17, widget=forms.NumberInput(attrs={'style': 'width: 188px'}))
AIRBNB_scraper is:
import requests, bs4
import re
import pandas as pd

price_pattern = re.compile(r'\d*\s*?,?\s*?\d*\szł')
photo_link_pattern = re.compile(r'https.*\)')

prices = []
listings_links = []
photo_links = []
listings_names = []

class AIRBNB_scraper():
    def __init__(self, city, country, accomodation_type='homes', arrival_date='2018-03-25', departure_date='2018-04-10'):
        self.city = city
        self.country = country
        self.arrival_date = arrival_date
        self.departure_date = departure_date
        self.accomodation_type = accomodation_type

    def make_soup(self, page_number):
        url = 'https://www.airbnb.pl/s/' + self.city + '--' + self.country + '/' + self.accomodation_type + '?query=' + self.city + '%2C%20' + self.country + '&refinement_paths%5B%5D=%2F' + self.accomodation_type + '&checkin=' + self.arrival_date + '&checkout=' + self.departure_date + '&section_offset=' + str(page_number)
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        return soup

    def get_listings(self, page_number):
        soup = self.make_soup(page_number)
        listings = soup.select('._f21qs6')
        number_of_listings = len(listings)
        print('\n' + "Number of listings found: " + str(number_of_listings))
        while number_of_listings != 18:
            print('\n' + str(number_of_listings) + ' is not correct number of listings, it should be 18. Trying again now.')
            soup = self.make_soup(page_number)
            listings = soup.find_all('div', class_='_f21qs6')
            number_of_listings = len(listings)
        print('\n' + "All fine! The number of listings is: " + str(number_of_listings) + '. Starting scraping now')
        return listings

    def scrape_listings_per_page(self, page_number):
        listings_to_scrape = self.get_listings(page_number)
        for listing in listings_to_scrape:
            #get price
            price_container = listing.find_all('span', class_='_hylizj6')
            price_search = re.search(price_pattern, str(price_container))
            price = price_search.group()
            #get listing_link
            listing_link = 'https://www.airbnb.pl' + listing.find('a', class_='_15ns6vh')['href']
            #get photo_link
            photo_link_node = listing.find('div', class_="_1df8dftk")['style']
            photo_link_search = re.search(photo_link_pattern, str(photo_link_node))
            #~ if photo_link_search:
            #~     print('Is regex match')
            #~ else:
            #~     print('No regex match')
            photo_link_before_strip = photo_link_search.group()
            photo_link = photo_link_before_strip[:-1]  #remove ") at the end of link
            #get listing_name
            listing_name = listing.find('div', class_='_1rths372').text
            #append lists
            prices.append(price)
            listings_links.append(listing_link)
            photo_links.append(photo_link)
            listings_names.append(listing_name)

    def scrape_multiple_pages(self, last_page_selector_number):
        last_page_selector_number += 1
        for x in range(0, last_page_selector_number):  #18
            self.scrape_listings_per_page(x)
            print('\n' + "INDEX OF PAGE BEING SCRAPED: " + str(x))
        scraped_data = pd.DataFrame({'prices': prices,
                                     'listings_links': listings_links,
                                     'photo_links': photo_links,
                                     'listings_names': listings_names})
        return scraped_data
You have module-level variables: prices, listings_links, etc. You append to these inside your AIRBNB_scraper instance, but they are not part of that instance and will persist between calls. You should make them instance attributes: define them as self.prices etc. in the __init__ method.
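A minimal sketch of that change, showing only __init__ (the module-level lists at the top of the file can then be removed):

class AIRBNB_scraper():
    def __init__(self, city, country, accomodation_type='homes',
                 arrival_date='2018-03-25', departure_date='2018-04-10'):
        self.city = city
        self.country = country
        self.arrival_date = arrival_date
        self.departure_date = departure_date
        self.accomodation_type = accomodation_type
        # fresh, per-instance containers: every new scraper starts with empty lists
        self.prices = []
        self.listings_links = []
        self.photo_links = []
        self.listings_names = []

Inside scrape_listings_per_page and scrape_multiple_pages, refer to self.prices, self.listings_links, self.photo_links and self.listings_names instead of the module-level names, and each request will start from empty lists.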

M2m relation breaks when passing filter parameters

I have an m2m relation between properties and images in my model, like imageproperty = models.ManyToManyField(Property, blank = True). I'm having an issue trying to filter properties with their associated images: whenever I pass a parameter in my query, the images are not showing up correctly (screenshot omitted). This is my code so far:
def filter_properties(request, prop, p):
    order = "creation_date"
    if p["sort"]: order = p["sort"]
    if p["asc_desc"] == "desc": order = '-' + order
    results = Property.objects.filter(status = True)
    for prop in results:
        prop.images = prop.image_set.all()[:1]  #Should i need to return in results so it brings values when filtering?
    if p["name"]: results = results.filter(name__icontains=p["name"])
    if p["price_from"]: results = results.filter(price__gte=int(p["price_from"]))
    if p["price_to"]: results = results.filter(price__lte=int(p["price_to"]))
    if p["category"]:
        lst = p["category"]
        or_query = Q(categories = lst[0])
        for c in lst[1:]:
            or_query = or_query | Q(categories = c)
        results = results.filter(or_query).distinct()
    return results
def search_properties_view(request):
    try:
        page = int(request.GET.get("page", '1'))
    except ValueError:
        page = 1
    p = request.POST
    prop = defaultdict(dict)
    parameters = dict.fromkeys(
        ('name', 'price_from', 'price_to', 'currency_type', 'activity_type', 'sort', 'asc_desc'),
        '',
    )
    parameters["category"] = []
    for k, v in p.items():
        if k == "category":
            parameters[k] = [int(x) for x in p.getlist(k)]
        elif k in parameters:
            parameters[k] = v
        elif k.startswith("name") or k.startswith("curency_type") or k.startswith("activity_type"):
            k, pk = k.split('-')
            prop[pk][k] = v
        elif k.startswith("category"):
            pk = k.split('-')[1]
            prop[pk]["category"] = p.getlist(k)
    if page != 1 and "parameters" in request.session:
        parameters = request.session["parameters"]
    else:
        request.session["parameters"] = parameters
    results = filter_properties(request, prop, parameters)
    paginator = Paginator(results, 20)
    try:
        results = paginator.page(page)
    except (InvalidPage, EmptyPage):
        request = paginator.page(paginator.num_pages)
    return render(request, 'propiedades/propiedades.html', {
        'propiedades': request.POST,
        'media_url': settings.MEDIA_URL,
        'results': results,
        'params': parameters,
        'categories': PropertyCategory.objects.all()
    })
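Regarding the inline question in filter_properties: the images attribute is set on objects from one evaluation of the queryset, and every later .filter(...) call builds a new queryset, so those attributes never reach the template. A sketch of one way around that, attaching the image only after all filters have run; it reuses the names from the code above (Property, Q, image_set) and is untested against this model:

def filter_properties(request, prop, p):
    order = "creation_date"
    if p["sort"]:
        order = p["sort"]
    if p["asc_desc"] == "desc":
        order = '-' + order
    results = Property.objects.filter(status=True)
    if p["name"]:
        results = results.filter(name__icontains=p["name"])
    if p["price_from"]:
        results = results.filter(price__gte=int(p["price_from"]))
    if p["price_to"]:
        results = results.filter(price__lte=int(p["price_to"]))
    if p["category"]:
        or_query = Q(categories=p["category"][0])
        for c in p["category"][1:]:
            or_query |= Q(categories=c)
        results = results.filter(or_query).distinct()
    # apply the ordering that was computed but never used, then materialise the queryset
    results = list(results.order_by(order))
    for property_obj in results:
        # the attribute now sticks, because these are the exact objects handed to the template
        property_obj.images = property_obj.image_set.all()[:1]
    return results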
