一、介绍
Searx 是一个免费的互联网元搜索引擎,可聚合多个搜索服务(例如 Baidu、Google、Bing)的结果,并且可以方便地添加、集成自定义搜索引擎。
二、如何自定义搜索引擎
1、在 engines 目录下添加 .py 文件,并实现 request(生成请求参数)和 response(格式化返回结果)两个函数;发送请求的逻辑由服务内置完成,无需自己编写。模板如下:
categories = ['general']  # optional


def request(query, params):
    """Pre-request callback: fill in the outgoing request for this engine.

    ``params`` is a dict with these keys::

        method  : POST/GET
        headers : {}
        data    : {}  # if method == POST
        url     : ''
        category: 'search category'
        pageno  : 1  # number of the requested page

    The query is percent-encoded before it is placed in the URL path, so
    spaces, ``/``, ``?``, ``#``, ``&`` and non-ASCII text cannot break or
    re-route the URL.

    Returns the same ``params`` dict with ``url`` set.
    """
    # Local import keeps the template self-contained.
    from urllib.parse import quote

    # safe='' so that even '/' inside the query is escaped.
    params['url'] = 'https://host/%s' % quote(query, safe='')
    return params
def response(resp):
    """Post-response callback: turn the HTTP response into search results.

    resp: requests response object

    Returns a list of result dicts, each with ``url``, ``title`` and
    ``content`` keys. This template returns a single empty placeholder
    result; a real engine would build the list from ``resp``.
    """
    return [{'url': '', 'title': '', 'content': ''}]
2、在 settings.yml 配置文件中添加该引擎的配置(至少包含 name 和 engine 两个字段,其中 engine 为上一步 py 文件的模块名,name 必须为小写且不能包含下划线)。
三、源码分析
1、多线程获取搜索结果
def search_multiple_requests(self, requests):
    """Run every engine request in its own thread and wait for the results.

    ``requests`` is an iterable of ``(engine_name, query, request_params)``
    tuples. One thread per entry calls ``PROCESSORS[engine_name].search``.
    All threads share the deadline ``self.start_time + self.actual_timeout``.

    An engine whose thread is still alive at the deadline is flagged with
    ``_timeout = True``, reported to ``self.result_container`` as
    unresponsive with reason ``'timeout'``, and logged. The thread itself
    is not stopped.
    """
    # All threads of this search share one unique name.
    search_id = str(uuid4())

    started = []
    for engine_name, query, request_params in requests:
        th = threading.Thread(
            target=PROCESSORS[engine_name].search,
            args=(query, request_params, self.result_container, self.start_time, self.actual_timeout, engine_name),
            name=search_id,
        )
        # Extra attributes read back below once the deadline has passed.
        th._timeout = False
        th._engine_name = engine_name
        th.start()
        started.append(th)

    # Join only the threads started above rather than scanning every thread
    # in the process and filtering by name.
    for th in started:
        # Time left until the shared deadline; never negative.
        remaining_time = max(0.0, self.actual_timeout - (time() - self.start_time))
        th.join(remaining_time)
        if th.is_alive():
            th._timeout = True
            self.result_container.add_unresponsive_engine(th._engine_name, 'timeout')
            logger.warning('engine timeout: {0}'.format(th._engine_name))
2、加载 搜索引擎
def load_engine(engine_data):
    """Validate an engine's configured name and load its module.

    ``engine_data`` is a dict that must contain ``name`` and ``engine``
    (the module file name without the ``.py`` suffix).

    * A name containing an underscore is logged as an error and the process
      exits with status 1.
    * A name that is not lowercase is lowercased, with a warning, and
      written back to ``engine_data['name']``.
    * Syntax/import/runtime errors while loading the module are treated as
      fatal (exit status 1); any other exception is logged and ``None`` is
      returned.
    """
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        # Load the engine module
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
        logger.exception('Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except Exception:
        # Non-fatal: skip this engine.
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None
    # NOTE(review): the visible excerpt ends here, so the loaded module is
    # neither used nor returned; presumably the original continues with
    # further engine setup - confirm against the full source.
3、加载搜索引擎模块的方法 load_module(实现插件机制时可以借鉴)
def load_module(filename, module_dir):
    """Load ``filename`` from ``module_dir`` as a fresh module and return it.

    The module name is the file name without its extension. Any module
    already registered under that name in ``sys.modules`` is dropped first,
    so the file is always re-executed. If executing the file raises, the
    partially initialised module is removed from ``sys.modules`` again and
    the exception propagates unchanged.
    """
    modname = splitext(filename)[0]
    # Forget any previously loaded copy.
    sys.modules.pop(modname, None)
    filepath = join(module_dir, filename)
    # See https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    spec = importlib.util.spec_from_file_location(modname, filepath)
    module = importlib.util.module_from_spec(spec)
    # Register before executing, as in the importlib recipe linked above.
    sys.modules[modname] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(modname, None)
        raise
    return module
4、值得借鉴的地方
引擎处理器必须继承该抽象类并实现 search 方法;调用搜索引擎时执行的就是这个 search 方法。
class EngineProcessor(ABC):
    """Abstract base class: concrete subclasses must override ``search``.

    The class cannot be instantiated until ``search`` is implemented.
    """

    @abstractmethod
    def search(self, query, params, result_container, start_time, timeout_limit):
        """Run one search; the base implementation does nothing.

        NOTE(review): parameter semantics are not visible in this excerpt.
        The threaded caller shown in search_multiple_requests passes an
        additional ``engine_name`` positional argument - confirm that
        concrete implementations accept it.
        """
        pass
start_new_thread(gc.collect, tuple()):另起一个线程执行垃圾回收,防止内存泄漏,同时不阻塞当前的搜索请求。
def search_standard(self):
    """
    Update self.result_container, self.actual_timeout

    Fetches the pending requests and the timeout from
    ``self._get_requests()``, dispatches the requests through
    ``self.search_multiple_requests`` when there are any, and always
    returns ``True``.
    """
    requests, self.actual_timeout = self._get_requests()
    print(f"zsq 多线程 查询start {time()}")

    # send all search-request
    if requests:
        self.search_multiple_requests(requests)
        # Start a new thread to run garbage collection, so the collection
        # does not run on the thread handling this search.
        start_new_thread(gc.collect, tuple())
    print(f"多线程 查询end {time()}")

    # The return value is always True; results are not returned from here.
    return True