search(13)- elastic4s-histograms:聚合直方圖

雪川大蟲發表於2020-05-15

在聚合的分組統計中我們會面臨兩種分組元素型別:連續型如時間,自然數等、離散型如地點、產品等。離散型資料本身就代表不同的組別,但連續型資料則需要手工按等長間隔進行切分了。下面是一個按價錢段聚合的例子:

POST /cartxns/_search
{
  "size" : 1,
  "aggs": {
    "sales_per_pricerange": {
      "histogram": {
        "field": "price",
        "interval": 20000
      },
      "aggs": {
        "total sales": {
          "sum": {
            "field": "price"
          }
        }
      }
    }
  }
 }
}

在上面這個例子中我們把價錢按20000進行分段。得出0-19999,20000-39999,40000-59999 ... 價格段的度量:

  "aggregations" : {
    "sales_per_pricerange" : {
      "buckets" : [
        {
          "key" : 0.0,
          "doc_count" : 3,
          "total sales" : {
            "value" : 37000.0
          }
        },
        {
          "key" : 20000.0,
          "doc_count" : 4,
          "total sales" : {
            "value" : 95000.0
          }
        },
        {
          "key" : 40000.0,
          "doc_count" : 0,
          "total sales" : {
            "value" : 0.0
          }
        },
        {
          "key" : 60000.0,
          "doc_count" : 0,
          "total sales" : {
            "value" : 0.0
          }
        },
        {
          "key" : 80000.0,
          "doc_count" : 1,
          "total sales" : {
            "value" : 80000.0
          }
        }
      ]
    }
  }

在elastic4s中是這樣表達的:

  val aggHist = search("cartxns").aggregations(
    histogramAggregation("sales_per_price")
      .field("price")
      .interval(20000).subAggregations(
      sumAggregation("total_sales").field("price")
    )
  )
  println(aggHist.show)

  val histResult = client.execute(aggHist).await

  if (histResult.isSuccess)
    histResult.result.aggregations.histogram("sales_per_price").buckets
        .foreach(hb => println(s"${hb.key},${hb.docCount}:${hb.sum("total_sales").value}"))
  else println(s"error: ${histResult.error.reason}")

....

POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_price":{"histogram":{"interval":20000.0,"field":"price"},"aggs":{"total_sales":{"sum":{"field":"price"}}}}}},Some(application/json))
0.0,3:37000.0
20000.0,4:95000.0
40000.0,0:0.0
60000.0,0:0.0
80000.0,1:80000.0

下面這個按車款分組統計的就是一個離散元素的聚合統計了:

POST /cartxns/_search
{
  "size" : 1,
  "aggs": {
    "avage price per model" : {
        "terms": {"field" : "make.keyword"},
        "aggs": {
          "average price": {
            "avg": {"field": "price"}
          },
          "max price" : {
            "max": {
              "field": "price"
            }
          },
          "min price" : {
            "min": {
              "field": "price"
            }
          }
          
        }
     }
  }
}

我們可以得到每一款車的平均售價、最低最高售價:

  "aggregations" : {
    "avage price per model" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "honda",
          "doc_count" : 3,
          "max price" : {
            "value" : 20000.0
          },
          "average price" : {
            "value" : 16666.666666666668
          },
          "min price" : {
            "value" : 10000.0
          }
        },
        {
          "key" : "ford",
          "doc_count" : 2,
          "max price" : {
            "value" : 30000.0
          },
          "average price" : {
            "value" : 27500.0
          },
          "min price" : {
            "value" : 25000.0
          }
        },
        {
          "key" : "toyota",
          "doc_count" : 2,
          "max price" : {
            "value" : 15000.0
          },
          "average price" : {
            "value" : 13500.0
          },
          "min price" : {
            "value" : 12000.0
          }
        },
        {
          "key" : "bmw",
          "doc_count" : 1,
          "max price" : {
            "value" : 80000.0
          },
          "average price" : {
            "value" : 80000.0
          },
          "min price" : {
            "value" : 80000.0
          }
        }
      ]
    }
  }

elastic4s示範如下:

  val aggDisc = search("cartxns").aggregations(
    termsAgg("prices_per_model","make.keyword").subAggregations(
      avgAgg("average_price","price"),
      minAgg("min_price","price"),
      maxAgg("max_price","price")
    )
  )
  println(aggDisc.show)
  val discResult = client.execute(aggDisc).await

  if (discResult.isSuccess)
    discResult.result.aggregations.terms("prices_per_model").buckets
      .foreach(mb =>
        println(s"${mb.key},${mb.docCount}:${mb.avg("average_price").value}," +
          s"${mb.min("min_price").value.getOrElse(0)}," +
          s"${mb.max("max_price").value.getOrElse(0)}"))
  else println(s"error: ${discResult.error.causedBy.getOrElse("unknown")}")

...

POST:/cartxns/_search?
StringEntity({"aggs":{"prices_per_model":{"terms":{"field":"make.keyword"},"aggs":{"average_price":{"avg":{"field":"price"}},"min_price":{"min":{"field":"price"}},"max_price":{"max":{"field":"price"}}}}}},Some(application/json))
honda,3:16666.666666666668,10000.0,20000.0
ford,2:27500.0,25000.0,30000.0
toyota,2:13500.0,12000.0,15000.0
bmw,1:80000.0,80000.0,80000.0

date_histogram是一種按時間間隔聚合的統計方法。對於按時間趨勢變化的資料分析十分有用:

POST /cartxns/_search
{
   "aggs": {
     "sales_per_month": {
       "date_histogram": {
         "field": "sold",
         "calendar_interval":"1M",
         "format": "yyyy-MM-dd"
       }
     }
   }
}

...

  "aggregations" : {
    "sales_per_month" : {
      "buckets" : [
        {
          "key_as_string" : "2014-01-01",
          "key" : 1388534400000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-02-01",
          "key" : 1391212800000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-03-01",
          "key" : 1393632000000,
          "doc_count" : 0
        },
        {
          "key_as_string" : "2014-04-01",
          "key" : 1396310400000,
          "doc_count" : 0
        },
        {
          "key_as_string" : "2014-05-01",
          "key" : 1398902400000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-06-01",
          "key" : 1401580800000,
          "doc_count" : 0
        },
        {
          "key_as_string" : "2014-07-01",
          "key" : 1404172800000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-08-01",
          "key" : 1406851200000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-09-01",
          "key" : 1409529600000,
          "doc_count" : 0
        },
        {
          "key_as_string" : "2014-10-01",
          "key" : 1412121600000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-11-01",
          "key" : 1414800000000,
          "doc_count" : 2
        }
      ]
    }
  }

上面這個例子產生以月為單元的bucket。elastic4s示範:

  val aggDateHist = search("cartxns").aggregations(
    dateHistogramAggregation("sales_per_month")
      .field("sold")
      .calendarInterval(DateHistogramInterval.Month)
      .format("yyyy-MM-dd")
      .minDocCount(1)
  )
  println(aggDateHist.show)

  val dtHistResult = client.execute(aggDateHist).await

  if (dtHistResult.isSuccess)
    dtHistResult.result.aggregations.dateHistogram("sales_per_month").buckets
        .foreach(db => println(s"${db.date},${db.docCount}"))
  else println(s"error: ${dtHistResult.error.causedBy.getOrElse("unknown")}")

...

POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_month":{"date_histogram":{"calendar_interval":"1M","min_doc_count":1,"format":"yyyy-MM-dd","field":"sold"}}}},Some(application/json))
2014-01-01,1
2014-02-01,1
2014-05-01,1
2014-07-01,1
2014-08-01,1
2014-10-01,1
2014-11-01,2

在以月劃分bucket後可以再進行每個月的深度聚合:

POST /cartxns/_search
{
   "aggs": {
     "sales_per_month": {
       "date_histogram": {
         "field": "sold",
         "calendar_interval":"1M",
         "format": "yyyy-MM-dd"
       },
       "aggs": {
         "per_make_sum": {
           "terms": {
             "field": "make.keyword",
             "size": 10
           },
           "aggs": {
             "sum_price": {
               "sum": {"field": "price"}
             }
           }
         },
         "total_sum": {
           "sum": {
             "field": "price"
           }
         }
       }
     }
   }
}

我們可以得到每個月的銷售總額、每個車款每個月的銷售,如下:

"aggregations" : {
    "sales_per_month" : {
      "buckets" : [
        {
          "key_as_string" : "2014-01-01",
          "key" : 1388534400000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "bmw",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 80000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 80000.0
          }
        },
        {
          "key_as_string" : "2014-02-01",
          "key" : 1391212800000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "ford",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 25000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 25000.0
          }
        },
        {
          "key_as_string" : "2014-03-01",
          "key" : 1393632000000,
          "doc_count" : 0,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [ ]
          },
          "total_sum" : {
            "value" : 0.0
          }
        },
        {
          "key_as_string" : "2014-04-01",
          "key" : 1396310400000,
          "doc_count" : 0,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [ ]
          },
          "total_sum" : {
            "value" : 0.0
          }
        },
        {
          "key_as_string" : "2014-05-01",
          "key" : 1398902400000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "ford",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 30000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 30000.0
          }
        },
        {
          "key_as_string" : "2014-06-01",
          "key" : 1401580800000,
          "doc_count" : 0,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [ ]
          },
          "total_sum" : {
            "value" : 0.0
          }
        },
        {
          "key_as_string" : "2014-07-01",
          "key" : 1404172800000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "toyota",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 15000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 15000.0
          }
        },
        {
          "key_as_string" : "2014-08-01",
          "key" : 1406851200000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "toyota",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 12000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 12000.0
          }
        },
        {
          "key_as_string" : "2014-09-01",
          "key" : 1409529600000,
          "doc_count" : 0,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [ ]
          },
          "total_sum" : {
            "value" : 0.0
          }
        },
        {
          "key_as_string" : "2014-10-01",
          "key" : 1412121600000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "honda",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 10000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 10000.0
          }
        },
        {
          "key_as_string" : "2014-11-01",
          "key" : 1414800000000,
          "doc_count" : 2,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "honda",
                "doc_count" : 2,
                "sum_price" : {
                  "value" : 40000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 40000.0
          }
        }
      ]
    }
  }

用elastic4s可以這樣寫:

  val aggMonthSales= search("cartxns").aggregations(
    dateHistogramAggregation("sales_per_month")
      .field("sold")
      .calendarInterval(DateHistogramInterval.Month)
      .format("yyyy-MM-dd")
      .minDocCount(1).subAggregations(
        termsAgg("month_make","make.keyword").subAggregations(
        sumAggregation("month_total_per_make").field("price")
      ),
      sumAggregation("monthly_total").field("price")
     )
   )

  println(aggMonthSales.show)
  
  val monthSalesResult = client.execute(aggMonthSales).await

  if (monthSalesResult.isSuccess)
     monthSalesResult.result.aggregations.dateHistogram("sales_per_month").buckets
       .foreach { sb =>
       println(s"${sb.date},${sb.docCount},${sb.sum("monthly_total").value}")
       sb.terms("month_make").buckets
        .foreach(mb =>       
        println(s"${mb.key},${mb.docCount},${mb.sum("month_total_per_make").value}"))
     }
  else println(s"error: ${monthSalesResult.error.causedBy.getOrElse("unknown")}")


...

POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_month":{"date_histogram":{"calendar_interval":"1M","min_doc_count":1,"format":"yyyy-MM-dd","field":"sold"},"aggs":{"month_make":{"terms":{"field":"make.keyword"},"aggs":{"month_total_per_make":{"sum":{"field":"price"}}}},"monthly_total":{"sum":{"field":"price"}}}}}},Some(application/json))
2014-01-01,1,80000.0
bmw,1,80000.0
2014-02-01,1,25000.0
ford,1,25000.0
2014-05-01,1,30000.0
ford,1,30000.0
2014-07-01,1,15000.0
toyota,1,15000.0
2014-08-01,1,12000.0
toyota,1,12000.0
2014-10-01,1,10000.0
honda,1,10000.0
2014-11-01,2,40000.0
honda,2,40000.0

 

相關文章