Integrating Spark with Elasticsearch

Published by 筆尖的痕 on 2016-03-26
    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>${scala.version}</version>
            <scope>compile</scope>
        </dependency>
        <!--<dependency>-->
            <!--<groupId>org.specs2</groupId>-->
            <!--<artifactId>specs2_${scala.binary.version}</artifactId>-->
            <!--<version>3.3.1</version>-->
            <!--<scope>test</scope>-->
        <!--</dependency>-->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-hadoop</artifactId>
            <version>${elasticsearch.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.spark</groupId>
                    <artifactId>spark-core_2.10</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.spark</groupId>
                    <artifactId>spark-sql_2.10</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.storm</groupId>
                    <artifactId>storm-core</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>cascading</groupId>
                    <artifactId>cascading-hadoop</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-pool2</artifactId>
            <version>2.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.9.0.1</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jettison</groupId>
            <artifactId>jettison</artifactId>
            <version>1.3.7</version>
        </dependency>
    </dependencies>
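
The ${scala.version}, ${scala.binary.version}, ${spark.version} and ${elasticsearch.version} placeholders are not defined in the fragment above; they belong in the POM's <properties> block. A minimal sketch (the exact versions are assumptions matching the Spark 1.x / elasticsearch-hadoop 2.2 era this code targets; adjust them to your cluster):

    <properties>
        <scala.version>2.10.5</scala.version>
        <scala.binary.version>2.10</scala.binary.version>
        <spark.version>1.6.1</spark.version>
        <elasticsearch.version>2.2.0</elasticsearch.version>
    </properties>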


Demo1

package demo.spark.elasticsearch

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

import org.elasticsearch.spark._

/**
  * Created by cao on 16-3-25.
  */
object Demo1 {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("ESDemo1")
    conf.set("es.index.auto.create", "true")
    val sc = new SparkContext(conf)

    val numbers = Map("one" -> 1, "two" -> 2, "three" -> 3)
    val airports = Map("arrival" -> "Otopeni", "SFO" -> "San Fran")

    // index the two maps as two documents under index "spark", type "docs"
    sc.makeRDD(Seq(numbers, airports)).saveToEs("spark/docs")
  }
}
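
Querying the index afterwards (for example GET /spark/docs/_search against the cluster) returns documents like these: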
{"took":2,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":12,"max_score":1.0,"hits":[{"_index":"spark","_type":"docs","_id":"AVOukOOI0OVBGh8ft4am","_score":1.0,"_source":{"one":1,"two":2,"three":3}},{"_index":"spark","_type":"docs","_id":"AVOu-vRa0OVBGh8ft4a9","_score":1.0,"_source":{"one":1,"two":2,"three":3}},{"_index":"spark","_type":"docs","_id":"AVOu_kMq0OVBGh8ft4a_","_score":1.0,"_source":{"departure":"MUC","arrival":"OTP"}},{"_index":"spark","_type":"docs","_id":"AVOvAVuS0OVBGh8ft4bE","_score":1.0,"_source":{"one":1,"two":2,"three":3}},{"_index":"spark","_type":"docs","_id":"AVOujInV0OVBGh8ft4aj","_score":1.0,"_source":{"arrival":"Otopeni","SFO":"San Fran"}},{"_index":"spark","_type":"docs","_id":"AVOujInn0OVBGh8ft4ak","_score":1.0,"_source":{"one":1,"two":2,"three":3}},{"_index":"spark","_type":"docs","_id":"AVOumniH0OVBGh8ft4as","_score":1.0,"_source":{"departure":"MUC","arrival":"OTP"}},{"_index":"spark","_type":"docs","_id":"AVOumniH0OVBGh8ft4at","_score":1.0,"_source":{"departure":"OTP","arrival":"SFO"}},{"_index":"spark","_type":"docs","_id":"AVOu_kMq0OVBGh8ft4a-","_score":1.0,"_source":{"departure":"OTP","arrival":"SFO"}},{"_index":"spark","_type":"docs","_id":"AVOvAVuJ0OVBGh8ft4bD","_score":1.0,"_source":{"arrival":"Otopeni","SFO":"San Fran"}}]}}


Demo2
package demo.spark.elasticsearch

/**
  * Created by cao on 16-3-26.
  */

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLContext._
import org.elasticsearch.spark.rdd.EsSpark
import org.elasticsearch.spark.sql._
import org.apache.spark.rdd.RDD._
import org.elasticsearch.spark._


object Demo2 {
  def main(args: Array[String]) {

    val sc = new SparkContext(new SparkConf().setAppName("Demo2"))

    case class Trip(departure: String, arrival: String)

    val upcomingTrip = Trip("OTP", "SFO")
    val lastWeekTrip = Trip("MUC", "OTP")

    // the connector serializes each case class instance into a JSON document
    val rdd = sc.makeRDD(Seq(upcomingTrip, lastWeekTrip))
    EsSpark.saveToEs(rdd, "spark/docs")
  }
}


Demo3

package demo.spark.elasticsearch

/**
  * Created by cao on 16-3-26.
  */

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.elasticsearch.spark.sql._
import org.apache.spark.rdd.RDD._
import org.elasticsearch.spark._


// Define the Person case class
case class Person(name: String, surname: String, age: Int)

object Demo3 {
  def main(args: Array[String]) {

    val sc = new SparkContext(new SparkConf().setAppName("Demo3"))
    // create the SQLContext
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // build the DataFrame: one Person per comma-separated line
    val people = sc.textFile("file:///home/cao/Desktop/poeple.txt")
      .map(_.split(","))
      .map(p => Person(p(0), p(1), p(2).trim.toInt))
      .toDF()

    people.saveToEs("spark/people")
  }
}
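
The job above assumes the input file holds one person per line in name,surname,age form. An illustrative poeple.txt (the contents here are made up for the example) could look like:

wang,wei,30
li,hua,25
zhang,san,41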

Demo4

package demo.spark.elasticsearch

/**
  * Created by cao on 16-3-26.
  */

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.elasticsearch.spark.sql._
import org.apache.spark.rdd.RDD._
import org.elasticsearch.spark._


object Demo4 {
  def main(args: Array[String]) {

    val sc = new SparkContext(new SparkConf().setAppName("Demo4"))
    // create the SQLContext
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // pushdown lets the connector translate Spark SQL filters and projections into Elasticsearch queries
    val options = Map("pushdown" -> "true", "es.nodes" -> "localhost", "es.port" -> "9200")

    val spark14DF = sqlContext.read.format("org.elasticsearch.spark.sql").options(options).load("spark/people")

    spark14DF.select("name","age").collect().foreach(println(_))

    // register the DataFrame so it can be queried with Spark SQL (Spark 1.x API)
    spark14DF.registerTempTable("people")
    val results = sqlContext.sql("SELECT name FROM people")
    results.map(t => "Name:" + t(0)).collect().foreach(println)
  }
}

Demo5
package demo.spark.elasticsearch

/**
  * Created by cao on 16-3-26.
  */

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}


object Demo5 {
  def main(args: Array[String]) {

    val sc = new SparkContext(new SparkConf().setAppName("Demo5"))
    // create the SQLContext
    val sqlContext = new SQLContext(sc)

    // expose the Elasticsearch index as a temporary table backed by the connector
    sqlContext.sql(
      "CREATE TEMPORARY TABLE myPeople    " +
        "USING org.elasticsearch.spark.sql " +
        "OPTIONS ( resource 'spark/people', nodes 'localhost:9200')" )

    sqlContext.sql("select * from myPeople").collect.foreach(println)
  }
}

Demo6

package demo.spark.elasticsearch

/**
  * Created by cao on 16-3-26.
  */

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.sql._


object Demo6 {
  def main(args: Array[String]) {

    val sc = new SparkContext(new SparkConf().setAppName("Demo6"))
    // create the SQLContext
    val sqlContext = new SQLContext(sc)

    // load the whole index as a DataFrame and print its inferred schema
    val people = sqlContext.esDF("spark/people")
    println(people.schema.treeString)

    // only the documents matching the URI query q=wang
    val wangs = sqlContext.esDF("spark/people", "?q=wang")
    wangs.show()
  }
}
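
All of the demos assume Elasticsearch is reachable on localhost:9200. When it is not, the connection settings can be set once on the SparkConf instead of per read or write. A minimal sketch (the host name is an assumption, replace it with your own):

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("EsConfDemo")
  .set("es.nodes", "es-host.example.com")   // address of the Elasticsearch node(s)
  .set("es.port", "9200")                   // REST port, 9200 by default
  .set("es.index.auto.create", "true")      // create missing indices automatically
val sc = new SparkContext(conf)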

