Highlight exact phrase with haystack/elasticsearch in Django - python

My web app uses Django Haystack with Elasticsearch as the search engine.
My SearchForm child class filters for exact search (content__exact parameter) if the search query contains a token with quotes.
class NepSearchForm(SearchForm):
    """Search form that honours 'AND', 'OR', 'NOT' modifiers and quoted phrases."""

    # ...

    def search(self):
        """Build the SearchQuerySet for the submitted query.

        :return: the result of ``no_query_found()`` when the form is invalid
            or the ``q`` field is empty; otherwise the SearchQuerySet built
            by ``_parse_query`` (with ``load_all()`` applied when
            ``self.load_all`` is set)
        """
        if not self.is_valid():
            return self.no_query_found()

        if not self.cleaned_data.get('q'):
            return self.no_query_found()

        sqs = self._parse_query(self.cleaned_data['q'])

        if self.load_all:
            sqs = sqs.load_all()

        return sqs

    def no_query_found(self):
        """Return ``self.searchqueryset.all()`` when there is no usable query."""
        return self.searchqueryset.all()

    def _parse_query(self, query):
        """
        Parse query treating modifiers 'AND', 'OR', 'NOT' to make what they're
        supposed to.

        A token made of more than one whitespace-separated word can only come
        from a quoted phrase, so it is filtered with ``content__exact``.

        :param query: query entered in search input box in form
        :return: SearchQuerySet object
        """
        try:
            tokens = shlex.split(query)
        except ValueError:
            # shlex raises ValueError on unbalanced quotes, which includes a
            # plain apostrophe in the user's text; fall back to a simple
            # whitespace split instead of failing the whole request.
            tokens = query.split()

        words = iter(tokens)
        result = self.searchqueryset
        for word in words:
            try:
                if word == 'AND':
                    result = result.filter_and(content=next(words))
                elif word == 'OR':
                    # TODO: fail when changing order of the words. See
                    # TODO: functional test:
                    # TODO: test_search_with_OR_modifier_returns_correct_objects
                    result = result.filter_or(content=next(words))
                elif word == 'NOT':
                    result = result.exclude(content=next(words))
                # if "word" is compounded of more than one non blank word the
                # term is inside quotes
                elif len(word.split()) > 1:
                    result = result.filter(content__exact=word)
                else:
                    result = result.filter(content=word)
            except StopIteration:
                # A trailing modifier has no operand: return what was built.
                return result

        return result
I'm using the Django template tag {% highlight %} to highlight the terms searched in my app, like in:
{% highlight result.object.<field> with query %}
What I'm seeing is that when I search for a quoted phrase of more than one word, separated by spaces, e.g. "História de fratura", the search results appear with only the token "de" highlighted. So it seems that the Highlighter class does not treat quoted terms as single tokens when marking them as highlighted in search results.
What can I do to highlight the query with a whole term inside quotes in search results?

You can build your own highlighter class as documentation states if default highlighter implementation doesn't work for you.

Related

add parameters inside url

I want to build a function that reads a URL from a text file, saves it to a variable, and then inserts some new values into the URL between the existing ones.
example of the url: https://domains.livedns.co.il/API/DomainsAPI.asmx/NewDomain?UserName=apidemo#livedns.co.il&Password=demo
Let's say I want to inject some values between UserName and Password, save the result to the file again, and use it later.
I started writing the function and experimenting with the urllib parser, but I still don't understand how to do that.
what i tried until now:
def dlastpurchase():
    """Print the URL stored in ``livednsurl.txt``, if that file exists.

    The file content is read, split with ``urlsplit`` and printed back via
    ``geturl()``. Nothing is printed when the file is missing.
    """
    if not os.path.isfile("livednsurl.txt"):
        return

    # The context manager closes the handle even if reading fails.
    with open("livednsurl.txt", "r") as apikeyfile:
        # Strip surrounding whitespace so a trailing newline in the file does
        # not end up as part of the URL.
        apikey = apikeyfile.read().strip()

    url_parse = urlsplit(apikey)
    print(url_parse.geturl())


dlastpurchase()
Thanks in advance for every tip and help
A little bit more complex example that I believe you will find interesting and also enjoy improving it (while it takes care of some scenarios, it might be lacking in some). Also functional to enable reuse in other cases. Here we go
assuming we have a text file, named 'urls.txt' that contains this url
https://domains.livedns.co.il/API/DomainsAPI.asmx/NewDomain?UserName=apidemo#livedns.co.il&Password=demo
from os import error
from urllib.parse import urlparse, parse_qs, urlunparse
filename = 'urls.txt'
function to parse the url and return its query parameters as well as the url object, which will be used to reconstruct the url later on
def parse_url(url):
    """Parse a given url and return its query parameters and parse result.

    Args:
        url (string): url string to parse

    Returns:
        tuple: ``(query_parameters, parsed)`` where ``query_parameters`` is
        the dictionary produced by ``parse_qs`` (each value is a list of
        strings) and ``parsed`` is the object returned by ``urlparse``.
        ``False`` is returned instead when ``error`` (imported from ``os``)
        is raised while parsing.
    """
    try:
        # parse the url and get the queries parameters from there
        parsed = urlparse(url)
        # parse the queries and return the dictionary containing them
        query_result = parse_qs(parsed.query)
        return (query_result, parsed)
    except error as exc:
        print('something failed !!!')
        # Report the exception that was actually raised, not the class.
        print(exc)
        return False
function to add a new query parameter or to replace an existing one
def insert_or_replace_word(query_dic, word, value):
    """Insert or overwrite one query parameter in a query dictionary.

    Args:
        query_dic (dictionary): the dictionary containing the query parameters;
            it is modified in place
        word (string): the query parameter to replace or insert values for
        value (string): the value to insert or use as replacement

    Returns:
        dictionary: the same ``query_dic`` object after the assignment, or
        ``None`` when ``error`` (imported from ``os``) is raised.
    """
    try:
        query_dic[word] = value
        return query_dic
    except error as exc:
        # Format the caught exception instance rather than the class.
        print('Something went wrong {0}'.format(exc))
function to format the query parameter and get them ready to reconstruct the new url
def format_query_strings(query_dic):
    """Serialise a query dictionary into a URL query string.

    Values may be plain strings or lists of strings (the shape ``parse_qs``
    produces). Keys and values are percent-encoded so that characters such as
    ``&`` or ``=`` inside a value cannot corrupt the query string, and every
    element of a list value is emitted as its own ``key=value`` pair.

    Args:
        query_dic (dictionary): final query dictionary after insertion or update

    Returns:
        string: the ``key=value`` pairs joined with ``&`` (empty string for an
        empty dictionary)
    """
    # Imported locally so this function does not rely on the file-level import
    # list.
    from urllib.parse import urlencode

    # doseq=True expands list values into repeated parameters; plain strings
    # are still treated as single values.
    return urlencode(query_dic, doseq=True)
we check out everything works by reading in text file, performing above operation and then saving the new url to a new file
# Read every url from the input file, replace its UserName parameter and
# append the rebuilt url to new_urls.txt.
with open(filename) as url:
    lines = url.readlines()

for line in lines:
    # readlines() keeps the line terminator; drop it so it is not parsed as
    # part of the url, and skip blank lines.
    line = line.strip()
    if not line:
        continue
    query_params, parsed = parse_url(line)
    new_query_dic = insert_or_replace_word(query_params, 'UserName', 'newUsername')
    final = format_query_strings(new_query_dic)
    #here you have to pass an iterable of lenth 6 in order to reconstruct the url
    new_url_object = [parsed.scheme, parsed.netloc, parsed.path, parsed.params, final, parsed.fragment]
    #this reconstructs the new url
    new_url = urlunparse(new_url_object)
    #create a new file and append the link inside of it
    with open('new_urls.txt', 'a') as new_file:
        # write the rebuilt url (the original referenced an undefined name)
        new_file.write(new_url)
        new_file.write('\n')
You don't have to use fancy tools to do that. Just split the url based on "?" Character. Then, split the second part based on "&" character. Add your new params to the list you have, and merge them with the base url you get.
url = "https://domains.livedns.co.il/API/DomainsAPI.asmx/NewDomain?UserName=apidemo#livedns.co.il&Password=demo"
# partition() splits on the first "?" only, so a later "?" inside the query
# cannot break the unpacking.
base, _, query = url.partition("?")
params = query.split("&") if query else []
params.insert(2, "new_user=yololo&new_passwd=hololo")
# Put the "?" separator back between the base url and the parameters.
base = base + "?" + "&".join(params)
print(base)
I did it this way because you asked to insert at a specific position. However, URL parameters do not depend on their order, so you can simply append the new ones at the end of the URL. Alternatively, you can edit the parameters in the list shown above.

Bold text with asterisks

In my Django project I want to make text bold if there are asterisks * at the start and end of the text, the same feature we have here on Stack Overflow. Although I convert ** to <b>, due to output escaping it becomes &lt;b&gt;. What is the right approach to achieve this?
template file contains {{ anidea.description|format_text}}
format_text is custom template filter
code..
from django import template
from django.utils.safestring import mark_safe
register = template.Library()
#register.filter(name='format_text')
def custom_formating(value):
    """Look for words wrapped in double asterisks inside ``value``.

    NOTE(review): as written, the function returns ``value`` unchanged. The
    loop only rebinds the local ``word`` and the result of ``mark_safe`` is
    discarded, so no bold markup ever reaches the output.
    """
    for word in value.split():
        # first and last two characters of the word
        start = word[:2]
        end = word[-2:]
        if start == '**' and end == '**':
            word = word[2:-2]
            word = '<b>' +word+ '</b>'
            # return value of mark_safe() is not stored anywhere
            mark_safe(word)
    return value
if you want the full suite of all markdown features, go with an existing markdown library.
if you just want <b> to print directly to the source code w/o escaping, use
{{ some_var|safe }}
I did it in following way.
views.py
i.description = i.description.split() #use of split()
template file (format_text is custom template filter)
{% for text in anidea.description %}
{{ text|format_text }}
{% endfor %}
filter
@register.filter(name='format_text')
def custom_formating(value):
    """Render a ``**word**`` token as bold HTML.

    A value that starts and ends with ``**`` and has text in between is
    returned as ``<b>text</b>`` marked safe for output; the text between the
    asterisks is escaped first so it cannot inject markup. Any other value is
    returned unchanged and is left to the template's normal autoescaping.
    """
    # Imported locally so this filter does not rely on the file-level imports.
    from django.utils.html import format_html

    # len > 4 rejects '**', '***' and '****', whose slices overlap or leave
    # nothing between the markers.
    if len(value) > 4 and value.startswith('**') and value.endswith('**'):
        # format_html escapes its arguments and returns a safe string.
        return format_html('<b>{}</b>', value[2:-2])
    return value
with this way I can achieve output escaping for description and desired text formatting.

Wikipedia Infobox parser with Multi-Language Support

I am trying to develop an Infobox parser in Python which supports all the languages of Wikipedia. The parser will get the infobox data and will return the data in a Dictionary.
The keys of the Dictionary will be the property which is described (e.g. Population, City name, etc...).
The problem is that Wikipedia has slightly different page contents for each language. But the most important thing is that the API response structure for each language can also be different.
For example, the API response for 'Paris' in English contains this Infobox:
{{Infobox French commune |name = Paris |commune status = [[Communes of France|Commune]] and [[Departments of France|department]] |image = <imagemap> File:Paris montage.jpg|275px|alt=Paris montage
and in Greek, the corresponding part for 'Παρίσι' is:
[...] {{Πόλη (Γαλλία) | Πόλη = Παρίσι | Έμβλημα =Blason paris 75.svg | Σημαία =Mairie De Paris (SVG).svg | Πλάτος Σημαίας =120px | Εικόνα =Paris - Eiffelturm und Marsfeld2.jpg [...]
In the second example, there isn't any 'Infobox' occurrence after the {{. Also, in the API response the name = Paris is not the exact translation for Πόλη = Παρίσι. (Πόλη means city, not name)
Because of such differences between the responses, my code fails.
Here is the code:
class WikipediaInfobox():
    """Fetch the infobox of a Wikipedia page and parse it into a dictionary."""

    # Class-level defaults; __init__ gives every instance its own containers
    # so parsed data is not shared between instances.
    infoboxArrayUnprocessed = []  # Maintains the order which the data is displayed.
    infoboxDictUnprocessed = {}  # Still Contains Brackets and Wikitext coding. Will be processed more later...
    language = "en"

    def __init__(self, pageTitle="", followRedirect=False):
        """Optionally fetch and parse the infobox of ``pageTitle``.

        With an empty ``pageTitle`` nothing is fetched. Otherwise the language
        is guessed from the title (falling back to "en" when the guess is
        "UNKNOWN"), the raw infobox is downloaded and the parsed result is
        stored in ``self.infoboxDictUnprocessed``.
        """
        self.infoboxArrayUnprocessed = []
        self.infoboxDictUnprocessed = {}
        if pageTitle != "":
            self.language = guess_language.guessLanguage(pageTitle)
            if self.language == "UNKNOWN":
                self.language = "en"
            infoboxRaw = self.getInfoboxRaw(pageTitle, followRedirect)
            self.getInfoboxDict(infoboxRaw)  # Now the parsed data is in self.infoboxDictUnprocessed

    def getInfoboxDict(self, infoboxRaw):
        """Parse raw infobox text into a dictionary (values still unprocessed).

        The first line is stored under the key ``"wiki_type"``. Every later
        line has its first character dropped and is split at its first ``=``
        into key and value; lines without ``=`` are skipped. Each pair is also
        appended, in order, to ``self.infoboxArrayUnprocessed``.

        Returns an empty dict for blank input, otherwise the parsed dict,
        which is also stored in ``self.infoboxDictUnprocessed``.
        """
        if infoboxRaw.strip() == "":
            return {}
        # NOTE(review): as written this replaces a space with a space, which
        # is a no-op; possibly a non-breaking space was intended - confirm.
        boxLines = [line.strip().replace(" ", " ") for line in infoboxRaw.splitlines()]
        wikiObjectType = boxLines[0]
        toReturn = {"wiki_type": wikiObjectType}
        for line in boxLines[1:]:
            # Drop the first character of the line (the "|" marker in the
            # infobox examples this parser was written against).
            entry = line[1:]
            # Split at the first "=" only, so values that themselves contain
            # "=" (URLs, nested template arguments) are kept whole.
            key, separator, value = entry.partition("=")
            if not separator:
                # Not a key/value line (blank or continuation); skip it.
                continue
            key = key.strip()
            value = value.strip()
            self.infoboxArrayUnprocessed.append({key: value})
            toReturn[key] = value
        self.infoboxDictUnprocessed = toReturn
        return toReturn

    def getInfoboxRaw(self, pageTitle, followRedirect=False, resetOld=True):
        """Download a page's wikitext and return the raw text of its infobox.

        Returns "" when the API yields no revision, when the page is a
        redirect and ``followRedirect`` is false, or when the text contains no
        ``{{Infobox`` marker. When ``resetOld`` is true the previously stored
        infobox containers on this instance are cleared first.
        """
        if resetOld:
            # Reset the instance state (plain local assignments would be
            # discarded when the method returns).
            self.infoboxDict = {}
            self.infoboxDictUnprocessed = {}
            self.infoboxArray = []
            self.infoboxArrayUnprocessed = []
        params = {"format": "xml", "action": "query", "prop": "revisions", "rvprop": "timestamp|user|comment|content"}
        params["titles"] = "%s" % urllib.quote(pageTitle.encode("utf8"))
        qs = "&".join("%s=%s" % (k, v) for k, v in params.items())
        url = "http://" + self.language + ".wikipedia.org/w/api.php?%s" % qs
        tree = etree.parse(urllib.urlopen(url))
        revs = tree.xpath('//rev')
        if len(revs) == 0:
            return ""
        text = revs[-1].text
        if "#REDIRECT" in text:
            if not followRedirect:
                return ""
            # The redirect target is the text between the first "[[" and "]]".
            redirectPage = text[text.find("[[") + 2:text.find("]]")]
            return self.getInfoboxRaw(redirectPage, followRedirect, resetOld)
        infoboxRaw = ""
        if "{{Infobox" in text:  # -> No Multi-language support:
            infoboxRaw = text.split("{{Infobox")[1].split("}}")[0]
        return infoboxRaw
Some parts of this code were found on this blog...
I don't want to reinvent the wheel, so maybe someone has a nice solution for multi-language support and neat parsing of the Infobox section of Wikipedia.
I have seen many alternatives, like DBPedia or some other parsers that MediaWiki recommends, but I haven't found anything that suits my needs, yet. I also want to avoid scraping the page with BeautifulSoup, because it can fail on some cases, but if it is necessary it will do.
If something isn't clear enough, please ask. I want to help as much as I can.
Wikidata is definitely the first choice these days if you want to get structured data. Still, if in the future you need to parse data from Wikipedia articles, especially as you are using Python, I can recommend mwparserfromhell, a Python library aimed at parsing wikitext that has an option to extract templates and their attributes. That won't directly fix your issue, as the templates used in different languages will definitely be different, but it might be useful if you continue trying to parse wikitext.

Jinja2 filter to convert custom markup to html

Having the autoescape property on (I want to keep it that way), I want user to be able to enter some custom markup, to have the opportunity to format text. For example, [s][/s] will be translated into <strong></strong>. I believe the right way to do this is to write the custom Jinja2 filter. But the following doesn't work:
#app.template_filter()
#evalcontextfilter
def mark2html(eval_ctx, value):
    # Escapes ``value`` first, then replaces the custom '[s]' marker with a
    # literal '<strong>' string.
    # NOTE(review): the surrounding question reports that the inserted tag
    # still comes out escaped; presumably escape() returns a Markup object
    # whose replace() escapes its string arguments - confirm against the
    # MarkupSafe documentation.
    result = escape(value).replace('[s]','<strong>')
    # Only wrap in Markup when the evaluation context has autoescape enabled.
    if eval_ctx.autoescape:
        result = Markup(result)
    return result
When applied to text like
<div>{{ custom_markup_text|mark2html }}</div>
When [s] is encountered in the string, stored in custom_markup_text, it should be converted to <strong> tag. AFAIK, Markup() function ensures that we trust this particular string, so that HTML is not escaped there. The filter is successfully applied, [s] is replaced by <strong>, but it's still escaped.
Obviously, the autoescaping is done after this custom filter. On the other hand, example filter from Jinja2 documentation works perfectly:
#app.template_filter()
#evalcontextfilter
def nl2br(eval_ctx, value):
    """Wrap each paragraph of ``value`` in ``<p>`` and turn newlines into ``<br>``.

    ``value`` is passed through ``escape`` and split with ``_paragraph_re``;
    the pieces are joined with a blank line between them. The result is
    wrapped in ``Markup`` only when ``eval_ctx.autoescape`` is true.
    """
    paragraphs = []
    for chunk in _paragraph_re.split(escape(value)):
        with_breaks = chunk.replace('\n', '<br>\n')
        paragraphs.append(u'<p>%s</p>' % with_breaks)
    html = u'\n\n'.join(paragraphs)
    if not eval_ctx.autoescape:
        return html
    return Markup(html)
What am I doing wrong?
Problem found. It's double escaping the string - rather silly.
This code works flawlessly:
@app.template_filter()
@evalcontextfilter
def mark2html(eval_ctx, value):
    """Convert the custom ``[s]...[/s]`` markup in ``value`` to ``<strong>`` tags.

    With autoescape enabled, ``value`` is escaped first and only the inserted
    ``<strong>`` / ``</strong>`` tags are trusted, so HTML typed by the user
    stays escaped. With autoescape disabled, the markers are replaced in the
    plain string and nothing is escaped.
    """
    if not eval_ctx.autoescape:
        # Autoescaping is off for this template: keep plain-string output.
        result = value.replace('[s]', u'<strong>')
        return result.replace('[/s]', u'</strong>')

    # escape() yields a Markup object. Passing the tags as Markup keeps
    # Markup.replace() from escaping them, while the user's text stays escaped.
    result = escape(value)
    result = result.replace('[s]', Markup(u'<strong>'))
    result = result.replace('[/s]', Markup(u'</strong>'))
    return result
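Note that value is not escaped inside the filter here, because the autoescape property is on. Be aware, however, that Markup() marks the entire result as safe, so any HTML the user typed into value is output unescaped as well; escape value first if it can contain untrusted HTML.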

Union and Intersect in Django

class Tag(models.Model):
    """A tag identified by its name."""

    # ``max_length`` is the CharField keyword; the older ``maxlength``
    # spelling is not accepted from Django 1.0 on.
    name = models.CharField(max_length=100)
class Blog(models.Model):
    """A blog that can carry any number of ``Tag`` objects."""

    # ``max_length`` is the CharField keyword; the older ``maxlength``
    # spelling is not accepted from Django 1.0 on.
    name = models.CharField(max_length=100)
    tags = models.ManyToManyField(Tag)
Simple models just to ask my question.
I wonder how can i query blogs using tags in two different ways.
Blog entries that are tagged with "tag1" or "tag2":
# Field lookups are separated from the field name by a double underscore.
Blog.objects.filter(tags__in=[1,2]).distinct()
Blog objects that are tagged with "tag1" and "tag2" : ?
Blog objects that are tagged with exactly "tag1" and "tag2" and nothing else : ??
Tag and Blog is just used for an example.
You could use Q objects for #1:
# Blogs who have either hockey or django tags.
from django.db.models import Q
# distinct() removes the duplicate rows produced when a blog matches both
# sides of the OR across the many-to-many join.
Blog.objects.filter(
    Q(tags__name__iexact='hockey') | Q(tags__name__iexact='django')
).distinct()
Unions and intersections, I believe, are a bit outside the scope of the Django ORM, but it's possible to do these. The following examples are from a Django application called django-tagging that provides the functionality. Line 346 of models.py:
For part two, you're looking for a union of two queries, basically
def get_union_by_model(self, queryset_or_model, tags):
    """
    Build a ``QuerySet`` of instances of the given model that carry
    *at least one* of the given tags.

    An empty ``QuerySet`` is returned when no tags are given or when no
    object matches.
    """
    tags = get_tag_list(tags)
    tag_count = len(tags)
    queryset, model = get_queryset_and_model(queryset_or_model)

    if tag_count == 0:
        return model._default_manager.none()

    model_table = qn(model._meta.db_table)
    # Values interpolated into the SQL text; the tag ids themselves are
    # passed separately as query parameters.
    sql_parts = {
        'model_pk': '%s.%s' % (model_table, qn(model._meta.pk.column)),
        'model': model_table,
        'tagged_item': qn(self.model._meta.db_table),
        'content_type_id': ContentType.objects.get_for_model(model).pk,
        'tag_id_placeholders': ','.join(['%s'] * tag_count),
    }
    # Select the ids of every object tagged with any of the given tags.
    query = """
SELECT %(model_pk)s
FROM %(model)s, %(tagged_item)s
WHERE %(tagged_item)s.content_type_id = %(content_type_id)s
AND %(tagged_item)s.tag_id IN (%(tag_id_placeholders)s)
AND %(model_pk)s = %(tagged_item)s.object_id
GROUP BY %(model_pk)s""" % sql_parts

    cursor = connection.cursor()
    cursor.execute(query, [tag.pk for tag in tags])
    matching_ids = [row[0] for row in cursor.fetchall()]

    if not matching_ids:
        return model._default_manager.none()
    return queryset.filter(pk__in=matching_ids)
For part #3 I believe you're looking for an intersection. See line 307 of models.py
def get_intersection_by_model(self, queryset_or_model, tags):
    """
    Build a ``QuerySet`` of instances of the given model that carry
    *every one* of the given tags.

    An empty ``QuerySet`` is returned when no tags are given or when no
    object matches.
    """
    tags = get_tag_list(tags)
    tag_count = len(tags)
    queryset, model = get_queryset_and_model(queryset_or_model)

    if tag_count == 0:
        return model._default_manager.none()

    model_table = qn(model._meta.db_table)
    # Values interpolated into the SQL text; the tag ids themselves are
    # passed separately as query parameters.
    sql_parts = {
        'model_pk': '%s.%s' % (model_table, qn(model._meta.pk.column)),
        'model': model_table,
        'tagged_item': qn(self.model._meta.db_table),
        'content_type_id': ContentType.objects.get_for_model(model).pk,
        'tag_id_placeholders': ','.join(['%s'] * tag_count),
        'tag_count': tag_count,
    }
    # Select the ids of the objects whose number of matching tag rows equals
    # the number of requested tags.
    query = """
SELECT %(model_pk)s
FROM %(model)s, %(tagged_item)s
WHERE %(tagged_item)s.content_type_id = %(content_type_id)s
AND %(tagged_item)s.tag_id IN (%(tag_id_placeholders)s)
AND %(model_pk)s = %(tagged_item)s.object_id
GROUP BY %(model_pk)s
HAVING COUNT(%(model_pk)s) = %(tag_count)s""" % sql_parts

    cursor = connection.cursor()
    cursor.execute(query, [tag.pk for tag in tags])
    matching_ids = [row[0] for row in cursor.fetchall()]

    if not matching_ids:
        return model._default_manager.none()
    return queryset.filter(pk__in=matching_ids)
I've tested these out with Django 1.0:
The "or" queries:
Blog.objects.filter(tags__name__in=['tag1', 'tag2']).distinct()
or you could use the Q class:
Blog.objects.filter(Q(tags__name='tag1') | Q(tags__name='tag2')).distinct()
The "and" query:
Blog.objects.filter(tags__name='tag1').filter(tags__name='tag2')
I'm not sure about the third one, you'll probably need to drop to SQL to do it.
Please don't reinvent the wheel and use django-tagging application which was made exactly for your use case. It can do all queries you describe, and much more.
If you need to add custom fields to your Tag model, you can also take a look at my branch of django-tagging.
This will do the trick for you
# Count takes the relation name as a string; a bare ``tags`` is an undefined name.
Blog.objects.filter(tags__name__in=['tag1', 'tag2']).annotate(tag_matches=models.Count('tags')).filter(tag_matches=2)

Categories