【04】把 Elasticsearch 當資料庫使:按欄位聚合

TaoWen發表於2016-02-18

使用 https://github.com/taowen/es-monitor 可以用 SQL 進行 elasticsearch 的查詢。最簡單的聚合是把整個index當作一個巨大的bucket(桶),然後去數這個桶裡的數字。複雜一些的聚合我們先要能夠把文件分到不同的桶裡,然後去分別數每個桶的數字。統計聚合需要 分桶 (GROUP BY XXX INTO BUCKET),也需要 指標(METRIC)

GROUP BY exchange => COUNT(*)

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
select exchange, count(*) from symbol group by exchange                                                        
EOF
{"count(*)": 3240, "exchange": "nyse"}
{"count(*)": 3089, "exchange": "nasdaq"}
{"count(*)": 385, "exchange": "nyse mkt"}

Elasticsearch

{
  "aggs": {
    "exchange": {
      "terms": {
        "field": "exchange", 
        "size": 0
      }, 
      "aggs": {}
    }
  }, 
  "size": 0
}
{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 23, 
  "aggregations": {
    "exchange": {
      "buckets": [
        {
          "key": "nyse", 
          "doc_count": 3240
        }, 
        {
          "key": "nasdaq", 
          "doc_count": 3089
        }, 
        {
          "key": "nyse mkt", 
          "doc_count": 385
        }
      ], 
      "sum_other_doc_count": 0, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.2003150000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 8029,
          "next_doc": 164079,
          "match": 0,
          "build_scorer": 28207,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 2657,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "1.245165000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2540820000ms"
          },
          {
            "name": "GlobalOrdinalsStringTermsAggregator: [exchange]",
            "reason": "aggregation",
            "time": "0.3493420000ms"
          }
        ]
      }
    ]
  }
]

GlobalOrdinalsStringTermsAggregator 這個 GROUP BY 的方式非常快

GROUP BY ipo_year => MAX(market_cap)

MAX(market_cap)COUNT(*)的區別在於,COUNT(*)不需要新增額外的metric聚合,每個GROUP BY自帶了doc_count的計算結果。
SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
select exchange, max(market_cap) from symbol group by exchange
EOF
{"max(market_cap)": 87022480321.0, "exchange": "nyse"}
{"max(market_cap)": 522690000000.0, "exchange": "nasdaq"}
{"max(market_cap)": 9586866593.0, "exchange": "nyse mkt"}

Elasticsearch

{
  "aggs": {
    "exchange": {
      "terms": {
        "field": "exchange", 
        "size": 0
      }, 
      "aggs": {
        "max(market_cap)": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }
  }, 
  "size": 0
}
{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 6, 
  "aggregations": {
    "exchange": {
      "buckets": [
        {
          "max(market_cap)": {
            "value": 87022480321.0
          }, 
          "key": "nyse", 
          "doc_count": 3240
        }, 
        {
          "max(market_cap)": {
            "value": 522690000000.0
          }, 
          "key": "nasdaq", 
          "doc_count": 3089
        }, 
        {
          "max(market_cap)": {
            "value": 9586866593.0
          }, 
          "key": "nyse mkt", 
          "doc_count": 385
        }
      ], 
      "sum_other_doc_count": 0, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.2007250000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 8264,
          "next_doc": 164363,
          "match": 0,
          "build_scorer": 28098,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 2678,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "1.741733000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2256530000ms"
          },
          {
            "name": "GlobalOrdinalsStringTermsAggregator: [exchange]",
            "reason": "aggregation",
            "time": "0.8099970000ms"
          }
        ]
      }
    ]
  }
]

下鑽 GROUP BY exchange, sector

SQL裡分桶的方式可以是多級的,先按exchange分,接著再按sector進一步細分

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
select exchange, sector, max(market_cap) from symbol group by exchange, sector
EOF
{"sector": "n/a", "max(market_cap)": 1409695805.0, "exchange": "nyse"}
{"sector": "Consumer Services", "max(market_cap)": 46884855259.0, "exchange": "nyse"}
{"sector": "Finance", "max(market_cap)": 35827719192.0, "exchange": "nyse"}
{"sector": "Energy", "max(market_cap)": 86091463700.0, "exchange": "nyse"}
{"sector": "Public Utilities", "max(market_cap)": 24117360000.0, "exchange": "nyse"}
{"sector": "Basic Industries", "max(market_cap)": 27347618968.0, "exchange": "nyse"}
{"sector": "Capital Goods", "max(market_cap)": 82469796110.0, "exchange": "nyse"}
{"sector": "Technology", "max(market_cap)": 47882944000.0, "exchange": "nyse"}
{"sector": "Health Care", "max(market_cap)": 82041827564.0, "exchange": "nyse"}
{"sector": "Consumer Non-Durables", "max(market_cap)": 87022480321.0, "exchange": "nyse"}
{"sector": "Consumer Durables", "max(market_cap)": 8233713549.0, "exchange": "nyse"}
{"sector": "Transportation", "max(market_cap)": 9094527055.0, "exchange": "nyse"}
{"sector": "Miscellaneous", "max(market_cap)": 54171930444.0, "exchange": "nyse"}
{"sector": "Finance", "max(market_cap)": 30620000000.0, "exchange": "nasdaq"}
{"sector": "Health Care", "max(market_cap)": 126540000000.0, "exchange": "nasdaq"}
{"sector": "Technology", "max(market_cap)": 522690000000.0, "exchange": "nasdaq"}
{"sector": "Consumer Services", "max(market_cap)": 230940000000.0, "exchange": "nasdaq"}
{"sector": "n/a", "max(market_cap)": 34620000000.0, "exchange": "nasdaq"}
{"sector": "Capital Goods", "max(market_cap)": 20310000000.0, "exchange": "nasdaq"}
{"sector": "Consumer Non-Durables", "max(market_cap)": 87500000000.0, "exchange": "nasdaq"}
{"sector": "Miscellaneous", "max(market_cap)": 51420000000.0, "exchange": "nasdaq"}
{"sector": "Consumer Durables", "max(market_cap)": 7690000000.0, "exchange": "nasdaq"}
{"sector": "Basic Industries", "max(market_cap)": 9170000000.0, "exchange": "nasdaq"}
{"sector": "Energy", "max(market_cap)": 5760000000.0, "exchange": "nasdaq"}
{"sector": "Public Utilities", "max(market_cap)": 77810000000.0, "exchange": "nasdaq"}
{"sector": "Transportation", "max(market_cap)": 23530000000.0, "exchange": "nasdaq"}
{"sector": "n/a", "max(market_cap)": 971774087.0, "exchange": "nyse mkt"}
{"sector": "Basic Industries", "max(market_cap)": 424184478.0, "exchange": "nyse mkt"}
{"sector": "Health Care", "max(market_cap)": 93765452.0, "exchange": "nyse mkt"}
{"sector": "Energy", "max(market_cap)": 5199118597.0, "exchange": "nyse mkt"}
{"sector": "Consumer Services", "max(market_cap)": 99940496.0, "exchange": "nyse mkt"}
{"sector": "Capital Goods", "max(market_cap)": 49720054.0, "exchange": "nyse mkt"}
{"sector": "Technology", "max(market_cap)": 44979980.0, "exchange": "nyse mkt"}
{"sector": "Consumer Non-Durables", "max(market_cap)": 9586866593.0, "exchange": "nyse mkt"}
{"sector": "Finance", "max(market_cap)": 259074010.0, "exchange": "nyse mkt"}
{"sector": "Public Utilities", "max(market_cap)": 968077000.0, "exchange": "nyse mkt"}
{"sector": "Consumer Durables", "max(market_cap)": 50452938.0, "exchange": "nyse mkt"}
{"sector": "Miscellaneous", "max(market_cap)": 66549988.0, "exchange": "nyse mkt"}
{"sector": "Transportation", "max(market_cap)": 47179899.0, "exchange": "nyse mkt"}

Elasticsearch

{
  "aggs": {
    "exchange": {
      "terms": {
        "field": "exchange", 
        "size": 0
      }, 
      "aggs": {
        "sector": {
          "terms": {
            "field": "sector", 
            "size": 0
          }, 
          "aggs": {
            "max(market_cap)": {
              "max": {
                "field": "market_cap"
              }
            }
          }
        }
      }
    }
  }, 
  "size": 0
}
{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 11, 
  "aggregations": {
    "exchange": {
      "buckets": [
        {
          "sector": {
            "buckets": [
              {
                "max(market_cap)": {
                  "value": 1409695805.0
                }, 
                "key": "n/a", 
                "doc_count": 963
              }, 
              {
                "max(market_cap)": {
                  "value": 46884855259.0
                }, 
                "key": "Consumer Services", 
                "doc_count": 468
              }, 
              {
                "max(market_cap)": {
                  "value": 35827719192.0
                }, 
                "key": "Finance", 
                "doc_count": 384
              }, 
              {
                "max(market_cap)": {
                  "value": 86091463700.0
                }, 
                "key": "Energy", 
                "doc_count": 230
              }, 
              {
                "max(market_cap)": {
                  "value": 24117360000.0
                }, 
                "key": "Public Utilities", 
                "doc_count": 221
              }, 
              {
                "max(market_cap)": {
                  "value": 27347618968.0
                }, 
                "key": "Basic Industries", 
                "doc_count": 200
              }, 
              {
                "max(market_cap)": {
                  "value": 82469796110.0
                }, 
                "key": "Capital Goods", 
                "doc_count": 188
              }, 
              {
                "max(market_cap)": {
                  "value": 47882944000.0
                }, 
                "key": "Technology", 
                "doc_count": 186
              }, 
              {
                "max(market_cap)": {
                  "value": 82041827564.0
                }, 
                "key": "Health Care", 
                "doc_count": 113
              }, 
              {
                "max(market_cap)": {
                  "value": 87022480321.0
                }, 
                "key": "Consumer Non-Durables", 
                "doc_count": 111
              }, 
              {
                "max(market_cap)": {
                  "value": 8233713549.0
                }, 
                "key": "Consumer Durables", 
                "doc_count": 66
              }, 
              {
                "max(market_cap)": {
                  "value": 9094527055.0
                }, 
                "key": "Transportation", 
                "doc_count": 60
              }, 
              {
                "max(market_cap)": {
                  "value": 54171930444.0
                }, 
                "key": "Miscellaneous", 
                "doc_count": 50
              }
            ], 
            "sum_other_doc_count": 0, 
            "doc_count_error_upper_bound": 0
          }, 
          "key": "nyse", 
          "doc_count": 3240
        }, 
        {
          "sector": {
            "buckets": [
              {
                "max(market_cap)": {
                  "value": 30620000000.0
                }, 
                "key": "Finance", 
                "doc_count": 637
              }, 
              {
                "max(market_cap)": {
                  "value": 126540000000.0
                }, 
                "key": "Health Care", 
                "doc_count": 621
              }, 
              {
                "max(market_cap)": {
                  "value": 522690000000.0
                }, 
                "key": "Technology", 
                "doc_count": 449
              }, 
              {
                "max(market_cap)": {
                  "value": 230940000000.0
                }, 
                "key": "Consumer Services", 
                "doc_count": 354
              }, 
              {
                "max(market_cap)": {
                  "value": 34620000000.0
                }, 
                "key": "n/a", 
                "doc_count": 287
              }, 
              {
                "max(market_cap)": {
                  "value": 20310000000.0
                }, 
                "key": "Capital Goods", 
                "doc_count": 179
              }, 
              {
                "max(market_cap)": {
                  "value": 87500000000.0
                }, 
                "key": "Consumer Non-Durables", 
                "doc_count": 111
              }, 
              {
                "max(market_cap)": {
                  "value": 51420000000.0
                }, 
                "key": "Miscellaneous", 
                "doc_count": 97
              }, 
              {
                "max(market_cap)": {
                  "value": 7690000000.0
                }, 
                "key": "Consumer Durables", 
                "doc_count": 82
              }, 
              {
                "max(market_cap)": {
                  "value": 9170000000.0
                }, 
                "key": "Basic Industries", 
                "doc_count": 78
              }, 
              {
                "max(market_cap)": {
                  "value": 5760000000.0
                }, 
                "key": "Energy", 
                "doc_count": 70
              }, 
              {
                "max(market_cap)": {
                  "value": 77810000000.0
                }, 
                "key": "Public Utilities", 
                "doc_count": 68
              }, 
              {
                "max(market_cap)": {
                  "value": 23530000000.0
                }, 
                "key": "Transportation", 
                "doc_count": 56
              }
            ], 
            "sum_other_doc_count": 0, 
            "doc_count_error_upper_bound": 0
          }, 
          "key": "nasdaq", 
          "doc_count": 3089
        }, 
        {
          "sector": {
            "buckets": [
              {
                "max(market_cap)": {
                  "value": 971774087.0
                }, 
                "key": "n/a", 
                "doc_count": 123
              }, 
              {
                "max(market_cap)": {
                  "value": 424184478.0
                }, 
                "key": "Basic Industries", 
                "doc_count": 52
              }, 
              {
                "max(market_cap)": {
                  "value": 93765452.0
                }, 
                "key": "Health Care", 
                "doc_count": 52
              }, 
              {
                "max(market_cap)": {
                  "value": 5199118597.0
                }, 
                "key": "Energy", 
                "doc_count": 32
              }, 
              {
                "max(market_cap)": {
                  "value": 99940496.0
                }, 
                "key": "Consumer Services", 
                "doc_count": 28
              }, 
              {
                "max(market_cap)": {
                  "value": 49720054.0
                }, 
                "key": "Capital Goods", 
                "doc_count": 25
              }, 
              {
                "max(market_cap)": {
                  "value": 44979980.0
                }, 
                "key": "Technology", 
                "doc_count": 20
              }, 
              {
                "max(market_cap)": {
                  "value": 9586866593.0
                }, 
                "key": "Consumer Non-Durables", 
                "doc_count": 15
              }, 
              {
                "max(market_cap)": {
                  "value": 259074010.0
                }, 
                "key": "Finance", 
                "doc_count": 13
              }, 
              {
                "max(market_cap)": {
                  "value": 968077000.0
                }, 
                "key": "Public Utilities", 
                "doc_count": 12
              }, 
              {
                "max(market_cap)": {
                  "value": 50452938.0
                }, 
                "key": "Consumer Durables", 
                "doc_count": 5
              }, 
              {
                "max(market_cap)": {
                  "value": 66549988.0
                }, 
                "key": "Miscellaneous", 
                "doc_count": 5
              }, 
              {
                "max(market_cap)": {
                  "value": 47179899.0
                }, 
                "key": "Transportation", 
                "doc_count": 3
              }
            ], 
            "sum_other_doc_count": 0, 
            "doc_count_error_upper_bound": 0
          }, 
          "key": "nyse mkt", 
          "doc_count": 385
        }
      ], 
      "sum_other_doc_count": 0, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.2216380000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 11316,
          "next_doc": 180546,
          "match": 0,
          "build_scorer": 29776,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 2456,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "3.587216000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2345190000ms"
          },
          {
            "name": "GlobalOrdinalsStringTermsAggregator: [exchange]",
            "reason": "aggregation",
            "time": "2.605895000ms"
          }
        ]
      }
    ]
  }
]

相關文章