elasticsearch查詢之大資料集分頁效能分析

無風聽海發表於2022-02-09

一、測試環境

python 3.7
elasticsearch 6.8
elasticsearch-dsl 7

安裝elasticsearch-dsl

pip install elasticsearch-dsl

測試elasticsearch連通性

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search


# Smoke test: connect to the local node and run a small phrase-prefix query.
client = Elasticsearch(hosts=['http://127.0.0.1:9200'])
s = Search(using=client, index="my_store_index").query("match_phrase_prefix", name="us")
# Only the `id` field is requested from _source, so `id` is the only document
# field available on each hit below.
s = s.source(['id'])
# Credentials are supplied as per-request parameters.
s = s.params(http_auth=["test", "test"])
response = s.execute()

for hit in response:
    # `name` is not part of the filtered _source; print the field that was fetched.
    print(hit.meta.score, hit.id)

11.642133 945d0426-033e-4a8a-86db-b776c6c9a082
11.642133 3c1aead4-aa6f-4256-a126-f29f84c9ac89
11.642133 77782add-ab58-4eb6-85af-bcbe79be9623
11.642133 75a02b9a-be31-4a78-a3d9-9af72f98cbf9
11.642133 d5aacf16-61fc-4f0c-b05d-3d57c8ab6236
11.642133 30912e1d-4662-4f24-bd5b-5a997e44c290
11.642133 95c28501-66a6-4786-917b-0f1e38707648
11.642133 605f4e11-08c8-4d60-b803-7925cf325cea
11.642133 5dd93a29-e75c-44e3-9f26-bd90e588bc1d
11.642133 84e97af5-4e99-466f-bd82-10cd2b79aa18

二、from + size一次性返回大量資料效能測試

通過以下code,直接使用from + size返回100000記錄,耗時17279ms;

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

def from_size_query(client):
    """Fetch the first 100000 matching ids with a single from + size request.

    Runs a bool query against ``my_store_index`` that excludes documents whose
    ``name`` matches the phrase prefix ``us``, then prints the total hit count
    and the ``took`` time reported in the response.

    Args:
        client: Elasticsearch client used to execute the search.
    """
    exclude_us = Q('bool', must_not=[Q('match_phrase_prefix', name='us')])

    search = (
        Search(using=client, index="my_store_index")
        .params(http_auth=["test", "test"], request_timeout=50)
        .query(exclude_us)
        .source(['id'])
    )
    # One request covering records 0..100000 (from + size pagination).
    response = search[0:100000].execute()

    print(f'hit total {response.hits.total}')
    print(f'request time {response.took}ms')

# Benchmark entry point: connect to the local node and time the single
# from + size request.
client = Elasticsearch(
    hosts=['http://127.0.0.1:9200'],
)
from_size_query(client)

hit total 485070
request time 17279ms

三、使用search after分頁返回大量資料效能測試

通過以下code,使用search_after分多次共返回100000記錄;從執行結果可以看到當每頁獲取記錄達到5000時,執行的時間基本變化不大;考慮到size增大對cpu和記憶體的影響,在測試資料情況下,size設定為3000或者4000比較合適;

def search_after_query(client, result):
    """Page through matching ids with ``search_after`` and accumulate timings.

    Args:
        client: Elasticsearch client used to execute the searches.
        result: Mutable dict that is read and updated in place. Keys used:
            ``size`` (page size), ``times`` (remaining request budget,
            decremented once per request), ``total`` (running sum of the
            ``took`` value of every response) and ``after_value`` (sort value
            to resume from; ``None`` starts from the first page).
    """
    page_size = result['size']
    q = Q('bool', must_not=[Q('match_phrase_prefix', name='us')])

    s = Search(using=client, index="my_store_index")
    s = s.params(http_auth=["test", "test"], request_timeout=50)
    s = s.query(q).source(['id']).sort('id')
    s = s[:page_size]

    # Compare against None so a falsy-but-valid cursor (0, '') still resumes.
    if result['after_value'] is not None:
        s = s.extra(search_after=[result['after_value']])

    # The first request is always issued; further pages are requested only
    # while the previous page was full and the request budget is not used up.
    while True:
        response = s.execute()
        fetch = len(response.hits)
        result['total'] += response.took
        result['times'] -= 1

        # `not fetch` guards the last-hit lookup below when the page is empty
        # (e.g. size == 0), which would otherwise raise IndexError.
        if not fetch or fetch != page_size or result['times'] <= 0:
            break

        # Resume after the sort value of the last hit on this page.
        sort_val = response.hits.hits[-1].sort[-1]
        s = s.extra(search_after=[sort_val])




client = Elasticsearch(hosts=['http://127.0.0.1:9200'])

# (request count, page size) pairs; every pair multiplies out to 100000 records.
for times, size in (
    (100, 1000),
    (50, 2000),
    (25, 4000),
    (20, 5000),
    (10, 10000),
    (5, 20000),
    (2, 50000),
):
    # Fresh accumulator per run; search_after_query updates it in place.
    result = {"total": 0, "times": times, "size": size, "after_value": None}
    search_after_query(client, result)
    print(f'size {result["size"]}  request  {times} times total {result["total"]}ms ')



size 1000  request  100 times total 14111ms 
size 2000  request  50 times total 11987ms 
size 4000  request  25 times total 11167ms 
size 5000  request  20 times total 10589ms 
size 10000  request  10 times total 9930ms 
size 20000  request  5 times total 9978ms  
size 50000  request  2 times total 9946ms 

四、使用scroll分頁返回大量資料效能測試

通過以下code,使用scroll分多次共取回100000記錄;從執行結果可以看到,通過不同的size獲取資料,執行的時間變化不大,且整體耗時高於search_after;另外scroll需要在服務端維護搜尋上下文,elasticsearch官方也不再建議使用scroll進行深度分頁;

def search_scroll_query(client, result):
    """Page through matching ids with the scroll API and accumulate timings.

    Args:
        client: Elasticsearch client used for the initial search, the
            follow-up ``scroll`` calls and the final ``clear_scroll``.
        result: Mutable dict that is read and updated in place. Keys used:
            ``size`` (page size), ``times`` (remaining request budget,
            decremented once per request) and ``total`` (running sum of the
            ``took`` value of every response).
    """
    page_size = result['size']
    q = Q('bool', must_not=[Q('match_phrase_prefix', name='us')])

    s = Search(using=client, index="my_store_index")
    s = s.params(request_timeout=50, scroll='1m')
    s = s.query(q).source(['id'])
    s = s[:page_size]

    # The initial search opens the scroll and returns the first page.
    response = s.execute()
    fetch = len(response.hits)
    result['total'] += response.took
    result['times'] -= 1
    scroll_id = response._scroll_id

    try:
        # Keep scrolling while pages come back full and budget remains.
        while fetch == page_size and result['times'] > 0:
            response = client.scroll(scroll_id=scroll_id, scroll='1m', request_timeout=50)
            scroll_id = response['_scroll_id']
            fetch = len(response['hits']['hits'])
            result['total'] += response['took']
            result['times'] -= 1
    finally:
        # Release the scroll explicitly rather than leaving it open until the
        # 1m keep-alive expires; a 404 (already gone) is not an error here.
        # This request is intentionally not added to result['total'].
        client.clear_scroll(body={"scroll_id": [scroll_id]}, ignore=(404,))

# Credentials are configured on the client for this benchmark.
client = Elasticsearch(hosts=['http://127.0.0.1:9200'], http_auth=["test", "test"])

# (request count, page size) pairs; every pair multiplies out to 100000 records.
for times, size in (
    (100, 1000),
    (50, 2000),
    (25, 4000),
    (20, 5000),
    (10, 10000),
    (5, 20000),
    (2, 50000),
):
    # Fresh accumulator per run; search_scroll_query updates it in place.
    result = {"total": 0, "times": times, "size": size}
    search_scroll_query(client, result)
    print(f'size {result["size"]}  request  {times} times total {result["total"]}ms ')


size 1000  request  100 times total 16573ms 
size 2000  request  50 times total 17678ms 
size 4000  request  25 times total 16719ms 
size 5000  request  20 times total 16031ms 
size 10000  request  10 times total 16008ms 
size 20000  request  5 times total 16074ms 
size 50000  request  2 times total 14390ms 

五、測試總結

通過對以上三種分頁方式的效能測試,可以看到對於獲取10W條記錄級別的資料集,search_after的效能最好;在不考慮其他效能優化的前提下,建議優先採用此種分頁方式;

相關文章