Searx/ImmortalPoetry
From LinuxReviews
Jump to navigation · Jump to search
Searx search plugin for immortalpoetry.com
File: searx/engines/immortalpoetry.py
"""
 @website     https://immortalpoetry.com/
 @provide-api yes (http://www.mediawiki.org/wiki/API:Search)
 @using-api   yes
 @results     JSON
 @stable      yes
 @parse       url, title, content
 @todo        change search URL depending on language
"""

from json import loads
from string import Formatter

from lxml.html import fromstring

from searx.url_utils import urlencode, quote
from searx.utils import html_to_text

# NOTE: Formatter and fromstring are not referenced by the code below; the
# imports are kept so the module's importable names stay unchanged.

# engine dependent config
categories = ['general']
language_support = True
paging = True
number_of_results = 5
search_type = 'title'  # possible values: title, text, nearmatch
supported_languages = ['en', 'nb', 'no', 'sv']

# NOTE(review): base_url is not read by request()/response(); the host is
# chosen per language in request(). Kept as-is in case configuration or other
# code refers to it -- confirm before removing.
base_url = 'https://linuxreviews.org/'

# Query-string template appended to the per-language site URL. The {query}
# placeholder receives an already url-encoded "srsearch=..." pair.
search_postfix = 'w/api.php?action=query'\
    '&list=search'\
    '&{query}'\
    '&format=json'\
    '&sroffset={offset}'\
    '&srlimit={limit}'
# '&srwhat={searchtype}'

# Primary language subtag -> host of the wiki that is queried for it.
# The leading underscore marks it as internal to this module.
_SITE_BY_LANGUAGE = {
    'en': 'immortalpoetry.com',
    'nb': 'dikt.org',
    'no': 'dikt.org',
    'sv': 'svenskadikter.com',
}

# A fragment must be at least this long to count as a paragraph.
_MIN_PARAGRAPH_LENGTH = 30
# Give up after more than this many too-short fragments.
_MAX_FAILED_ATTEMPTS = 5


# get first meaningful paragraph
# try to avoid [[File:images]] and [[links]]
# TODO: Just filter [ and ] and remove if it's an image
def extract_first_paragraph(content):
    """Return the first bracket-delimited fragment of *content* that is long
    enough to be a paragraph.

    The text is cut at every ']' and then at every '['; fragments are
    examined in document order.

    :param content: plain text of a search snippet
    :returns: the first fragment with at least 30 characters, or None when
              more than 5 shorter fragments precede any such fragment or no
              fragment qualifies at all
    """
    failed_attempts = 0
    for wparagraph in content.split(']'):
        for paragraph in wparagraph.split('['):
            if len(paragraph) >= _MIN_PARAGRAPH_LENGTH:
                # Return right away: a plain `break` would only leave the
                # inner loop and let a later fragment replace this match.
                return paragraph
            failed_attempts += 1
            if failed_attempts > _MAX_FAILED_ATTEMPTS:
                return None
    return None


# do search-request
def request(query, params):
    """Fill *params* with the API URL for *query* on the wiki matching the
    requested language.

    :param query: search terms; url-encoded here as the ``srsearch`` value
    :param params: request parameters; ``pageno`` and ``language`` are read,
                   ``url`` and ``site_url`` are written
    :returns: *params*, or None (leaving *params* untouched) when no site is
              known for the requested language
    """
    offset = (params['pageno'] - 1) * number_of_results

    string_args = dict(query=urlencode({'srsearch': query}),
                       offset=offset,
                       limit=number_of_results,
                       searchtype=search_type)

    language = params['language']
    if language == 'all':
        language = 'en'

    # Match on the primary subtag so that both bare codes ('sv', 'nb', 'no')
    # and regional ones ('sv-SE', 'nb-NO', 'en-US') select a site.
    site = _SITE_BY_LANGUAGE.get(language.split('-')[0].lower())
    if site is None:
        return None

    site_url = 'https://' + site + '/'
    search_url = site_url + search_postfix

    params['url'] = search_url.format(**string_args)
    # response() needs the site root to build result links.
    params['site_url'] = site_url

    return params


# get response from search-request
def response(resp):
    """Turn the JSON search reply into a list of result dicts.

    :param resp: response object; ``resp.text`` holds the JSON body and
                 ``resp.search_params['site_url']`` the site root stored by
                 request()
    :returns: list of dicts with ``url``, ``title`` and ``content`` keys;
              empty when the reply contains no hits
    """
    search_results = loads(resp.text)
    site_url = resp.search_params['site_url']

    hits = search_results.get('query', {}).get('search')

    # return empty array if there are no results
    if not hits:
        return []

    results = []

    # parse results
    for result in hits:
        # Read the snippet once with a default so an entry without one does
        # not raise KeyError further down.
        snippet = result.get('snippet', '')
        if snippet.startswith('#REDIRECT'):
            continue

        # Page links use the title with spaces replaced by underscores.
        url = site_url + quote(result['title'].replace(' ', '_').encode('utf-8'))

        summary = extract_first_paragraph(html_to_text(snippet))

        # append result
        results.append({'url': url,
                        'title': result['title'],
                        'content': summary})

    # return results
    return results
The plugin can use configuration such as:
File: searx/settings.yml
- name : immortalpoetry
  engine : immortalpoetry
  shortcut : ip
  weight : 2
  number_of_results : 5