Writing a WordCount program in Scala — reading from multiple files, computing, and storing the results in MySQL

Posted by loveheping on 2018-02-12
   No preamble — let's go straight to the code, hehe.
1. Common utility class
package com.ruozedata.scalawroks

import java.io.{FileSystem => _, _}

import org.apache.hadoop.fs._

import scala.collection.mutable.ListBuffer

object ScalaToHDFSUtils {

  def isDir(fs: FileSystem, dirName: String): Boolean = {
    fs.isDirectory(new Path(dirName))
  }

  def isDir(fs: FileSystem, dirName: Path): Boolean = {
    fs.isDirectory(dirName)
  }

  def isFile(fs: FileSystem, fileName: String): Boolean = {
    fs.isFile(new Path(fileName))
  }

  def isFile(fs: FileSystem, fileName: Path): Boolean = {
    fs.isFile(fileName)
  }

  def createFile(fs: FileSystem, fileName: String): Boolean = {
    fs.createNewFile(new Path(fileName))
  }

  def createFile(fs: FileSystem, name: Path): Boolean = {
    fs.createNewFile(name)
  }

  def createFolder(fs: FileSystem, folderName: String): Boolean = {
    fs.mkdirs(new Path(folderName))
  }

  def createFolder(fs: FileSystem, folderName: Path): Boolean = {
    fs.mkdirs(folderName)
  }

  def exists(fs: FileSystem, name: String): Boolean = {
    fs.exists(new Path(name))
  }

  def exists(fs: FileSystem, name: Path): Boolean = {
    fs.exists(name)
  }

  def rename(fs: FileSystem, srcPath: String, distPath: String): Boolean = {
    fs.rename(new Path(srcPath), new Path(distPath))
  }

  def move(fs: FileSystem, srcPath: String, distPath: String): Boolean = {
    fs.rename(new Path(srcPath), new Path(distPath))
  }

  class MyPathFilter extends PathFilter {
    override def accept(path: Path): Boolean = true
  }

  // Lists the files under fullName. When flgFile is true, subdirectories are
  // traversed recursively; when false, only the current directory is read.
  def listFiles(fs: FileSystem, fullName: String, holder: ListBuffer[String],
                flgFile: Boolean = false): ListBuffer[String] = {

    val filesStatus = fs.listStatus(new Path(fullName), new MyPathFilter)

    for (status <- filesStatus) {
      val filePath: Path = status.getPath
      if (isFile(fs, filePath))
        holder += filePath.toString
      else if (flgFile)
        // pass flgFile along, so the recursion continues below the first level
        listFiles(fs, filePath.toString, holder, flgFile)
    }
    holder
  }

  def copyFile(fs: FileSystem, source: String, target: String): Unit = {

    val sourcePath = new Path(source)
    val targetPath = new Path(target)

    if (!exists(fs, targetPath))
      createFile(fs, targetPath)

    val inputStream: FSDataInputStream = fs.open(sourcePath)
    val outputStream: FSDataOutputStream = fs.create(targetPath)
    transport(inputStream, outputStream)
  }

  def transport(inputStream: InputStream, outputStream: OutputStream): Unit = {
    val buffer = new Array[Byte](64 * 1000)
    try {
      var len = inputStream.read(buffer)
      while (len != -1) {
        outputStream.write(buffer, 0, len)
        len = inputStream.read(buffer)
      }
      outputStream.flush()
    } catch {
      case e: IOException => e.printStackTrace()
    } finally {
      inputStream.close()
      outputStream.close()
    }
  }

  def readFiles(fs: FileSystem, source: String, values: ListBuffer[String]): ListBuffer[String] = {
    val sourcePath = new Path(source)
    val inputStream: FSDataInputStream = fs.open(sourcePath)
    val bufferedReader = new BufferedReader(new InputStreamReader(inputStream))

    try {
      var lineTxt: String = bufferedReader.readLine()
      while (lineTxt != null) {
        values.append(lineTxt)
        lineTxt = bufferedReader.readLine()
      }
      values
    } finally {
      if (bufferedReader != null) {
        bufferedReader.close()
      }
      if (inputStream != null) {
        inputStream.close()
      }
    }
  }
}
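Before moving on, here is a minimal sketch of how these helpers compose; the object name UtilsUsageSketch is hypothetical, and the URI and path are assumptions mirroring the apps below:

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

import scala.collection.mutable.ListBuffer

object UtilsUsageSketch extends App {
  // same HDFS user as the apps below
  System.setProperty("HADOOP_USER_NAME", "hadoop")
  val fs = FileSystem.get(new URI("hdfs://192.168.137.130:9000/"), new Configuration())
  try {
    // flgFile = true recurses into subdirectories as well
    val files = ScalaToHDFSUtils.listFiles(fs, "/ruozedata/", new ListBuffer[String], flgFile = true)
    files.foreach(println)
  } finally {
    fs.close()
  }
}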
2. File-copying class
package com.ruozedata.scalawroks

import java.io.IOException
import java.net.URI
import java.text.SimpleDateFormat
import java.util.Date

import org.apache.hadoop.fs._
import org.apache.hadoop.conf.Configuration

import scala.collection.mutable.ListBuffer

object ScalaToHDFSApp extends App {
  val conf = new Configuration()
  var fs: FileSystem = null

  val sourcePath = "/ruozedata/"               // when packaged as a jar, take args(0)
  val targetPath = "/ruozedata/scala/"         // when packaged as a jar, take args(1)
  val uriPath = "hdfs://192.168.137.130:9000/" // when packaged as a jar, take args(2)

  // new file name
  var newFileName = ""

  // file sequence number
  var fSeq = 1

  // set the user for HDFS access
  System.setProperty("HADOOP_USER_NAME", "hadoop")

  try {
    fs = FileSystem.get(new URI(uriPath), conf)

    val holder: ListBuffer[String] = new ListBuffer[String]

    // list the files in the source directory (true = recurse into
    // subdirectories, false = current directory only; default: false)
    val paths: List[String] = ScalaToHDFSUtils.listFiles(fs, sourcePath, holder).toList

    for (path <- paths) {
      // build the new file name
      newFileName = dateToStr() + fSeq.toString + ".txt"
      // increment the file sequence number
      fSeq += 1
      // copy the file to the target directory
      ScalaToHDFSUtils.copyFile(fs, sourcePath + new Path(path).getName, targetPath + newFileName)
    }
  } catch {
    case e: IOException => e.printStackTrace()
  } finally {
    if (fs != null) {
      fs.close()
    }
  }

  def dateToStr(): String = {
    // format the current date/time as a year-month-day-hour-minute string
    val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMddHHmm")
    dateFormat.format(new Date) + "-"
  }
}
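So a run at 2018-02-12 14:49 against three source files produces /ruozedata/scala/201802121449-1.txt through 201802121449-3.txt, which matches the directory listing in the test data below.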
3. Read the files, run the word count, and store the result in MySQL

package com.ruozedata.scalawroks

import java.io.IOException
import java.net.URI
import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.collection.mutable.ListBuffer

object ScalacReadHDFSApp extends App {
  val conf = new Configuration()
  val sourcePath = "/ruozedata/scala/"         // when packaged as a jar, take args(0)
  val uriPath = "hdfs://192.168.137.130:9000/" // when packaged as a jar, take args(1)
  val holder: ListBuffer[String] = new ListBuffer[String]
  val readHolder: ListBuffer[String] = new ListBuffer[String]

  var fs: FileSystem = null

  // set the user for HDFS access
  System.setProperty("HADOOP_USER_NAME", "hadoop")

  try {
    fs = FileSystem.get(new URI(uriPath), conf)

    // list the files in the source directory (true = recurse into
    // subdirectories, false = current directory only; default: false)
    val paths: List[String] = ScalaToHDFSUtils.listFiles(fs, sourcePath, holder).toList

    // readFiles appends every line of every file to readHolder
    for (path <- paths) {
      ScalaToHDFSUtils.readFiles(fs, sourcePath + new Path(path).getName, readHolder)
    }

    // run the word count: split each line on tabs, pair each word with 1,
    // group by word, and sum the counts per word
    val wc = readHolder.toList.flatMap(_.split("\t")).map((_, 1)).groupBy(_._1).mapValues(_.map(_._2).reduce(_ + _))

    // write the result to MySQL
    insertMySQL(wc)
  } catch {
    case e: IOException => e.printStackTrace()
  } finally {
    if (fs != null) {
      fs.close()
    }
  }

  def insertMySQL(wc: Map[String, Int]): Unit = {
    // connect to the database named "works" on the given host
    val url = "jdbc:mysql://192.168.137.130:3306/works"
    val username = "root"
    val password = "123456"

    var connection: Connection = null

    try {
      connection = DriverManager.getConnection(url, username, password)
      // use a PreparedStatement rather than string concatenation,
      // so word values are escaped properly
      val ps: PreparedStatement = connection.prepareStatement(
        "insert into wordcounts(words, counts) values(?, ?)")

      for ((key, value) <- wc) {
        ps.setString(1, key)
        ps.setString(2, value.toString)
        ps.addBatch()
      }

      ps.executeBatch()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (connection != null) {
        connection.close()
      }
    }
  }
}
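If the chained word-count line looks dense, here is what each step produces on a tiny tab-separated input (values chosen to mirror the test data below):

val lines = List("hello\tworld", "hello\truoze")

val pairs = lines.flatMap(_.split("\t")).map((_, 1))
// List((hello,1), (world,1), (hello,1), (ruoze,1))

val grouped = pairs.groupBy(_._1)
// Map(hello -> List((hello,1), (hello,1)), world -> List((world,1)), ruoze -> List((ruoze,1)))

val wc = grouped.mapValues(_.map(_._2).reduce(_ + _))
// Map(hello -> 2, world -> 1, ruoze -> 1)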
4. MySQL table creation
create table wordcounts(words varchar(50), counts varchar(20));
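Note that counts is declared varchar(20) and the program inserts the integer counts as strings; this works, but an int column would sort and aggregate more naturally.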
5. Maven configuration (pom.xml)
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>testScala</groupId>
  <artifactId>testScala</artifactId>
  <version>1.0</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.11.8</scala.version>
    <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
    <spark.version>2.2.1</spark.version>
    <mysql.version>5.1.25</mysql.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>

    <repository>
      <id>cloudera</id>
      <name>cloudera</name>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>${mysql.version}</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.5</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
    </plugins>
  </build>
  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>
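One caveat: the scala-tools.org repositories have been offline for years, so on a current machine you may need to drop those repository entries and replace the old maven-scala-plugin with its successor, net.alchim31.maven's scala-maven-plugin.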
6. Test data
hdfs dfs -ls /ruozedata/scala
Found 3 items
-rw-r--r--   3 hadoop supergroup         24 2018-02-12 14:48 /ruozedata/scala/201802121449-1.txt
-rw-r--r--   3 hadoop supergroup         12 2018-02-12 14:48 /ruozedata/scala/201802121449-2.txt
-rw-r--r--   3 hadoop supergroup         12 2018-02-12 14:48 /ruozedata/scala/201802121449-3.txt
-----------------------------------------------------------------------------------------------------------
hdfs dfs -cat /ruozedata/scala/201802121449-1.txt
hello   world
hello   worls

hdfs dfs -cat /ruozedata/scala/201802121449-2.txt
hello   ruoze

hdfs dfs -cat /ruozedata/scala/201802121449-3.txt
ruoze   hello

7. Test results
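Given the test data above (tab-separated words), a query of the wordcounts table should return:

words | counts
------+-------
hello | 4
world | 1
worls | 1
ruoze | 2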


Note: if you have any good suggestions going forward, please share them.


From @若澤大資料



From the ITPUB blog, link: http://blog.itpub.net/31511218/viewspace-2151064/. If reposting, please credit the source; otherwise legal responsibility may be pursued.
