Writing a WordCount program in Scala: reading from multiple files and storing the result in a MySQL database

No long preamble here, let's get straight to the code.
1. Common utility class
package com.ruozedata.scalawroks

// Hide java.io.FileSystem so that FileSystem below unambiguously
// refers to the Hadoop class
import java.io.{FileSystem => _, _}

import org.apache.hadoop.fs._

import scala.collection.mutable.ListBuffer

object ScalaToHDFSUtils {

  def isDir(fs: FileSystem, dirName: String): Boolean = {
    fs.isDirectory(new Path(dirName))
  }

  def isDir(fs: FileSystem, dirName: Path): Boolean = {
    fs.isDirectory(dirName)
  }

  def isFile(fs: FileSystem, fileName: String): Boolean = {
    fs.isFile(new Path(fileName))
  }

  def isFile(fs: FileSystem, fileName: Path): Boolean = {
    fs.isFile(fileName)
  }

  def createFile(fs: FileSystem, fileName: String): Boolean = {
    fs.createNewFile(new Path(fileName))
  }

  def createFile(fs: FileSystem, name: Path): Boolean = {
    fs.createNewFile(name)
  }

  def createFolder(fs: FileSystem, folderName: String): Boolean = {
    fs.mkdirs(new Path(folderName))
  }

  def createFolder(fs: FileSystem, folderName: Path): Boolean = {
    fs.mkdirs(folderName)
  }

  def exists(fs: FileSystem, name: String): Boolean = {
    fs.exists(new Path(name))
  }

  def exists(fs: FileSystem, name: Path): Boolean = {
    fs.exists(name)
  }

  def rename(fs: FileSystem, srcPath: String, distPath: String): Boolean = {
    fs.rename(new Path(srcPath), new Path(distPath))
  }

  // On HDFS a move is just a rename
  def move(fs: FileSystem, srcPath: String, distPath: String): Boolean = {
    fs.rename(new Path(srcPath), new Path(distPath))
  }

  // Accept-all filter; narrow the predicate to restrict which paths are listed
  class MyPathFilter extends PathFilter {
    override def accept(path: Path): Boolean = true
  }

  // List the files under fullName into holder. When flgFile is true, recurse
  // into subdirectories; when false (the default), list only the given directory.
  def listFiles(fs: FileSystem, fullName: String, holder: ListBuffer[String],
                flgFile: Boolean = false): ListBuffer[String] = {
    val filesStatus = fs.listStatus(new Path(fullName), new MyPathFilter)

    for (status <- filesStatus) {
      val filePath: Path = status.getPath
      if (isFile(fs, filePath))
        holder += filePath.toString
      else if (flgFile)
        listFiles(fs, filePath.toString, holder, flgFile) // propagate the flag when recursing
    }
    holder
  }

  def copyFile(fs: FileSystem, source: String, target: String): Unit = {
    val sourcePath = new Path(source)
    val targetPath = new Path(target)

    if (!exists(fs, targetPath))
      createFile(fs, targetPath)

    val inputStream: FSDataInputStream = fs.open(sourcePath)
    val outputStream: FSDataOutputStream = fs.create(targetPath)
    transport(inputStream, outputStream)
  }

  def transport(inputStream: InputStream, outputStream: OutputStream): Unit = {
    val buffer = new Array[Byte](64 * 1000)
    try {
      // read inside the try so a failure on the first read still closes the streams
      var len = inputStream.read(buffer)
      while (len != -1) {
        outputStream.write(buffer, 0, len)
        len = inputStream.read(buffer)
      }
      outputStream.flush()
    } catch {
      case e: IOException => e.printStackTrace()
    } finally {
      inputStream.close()
      outputStream.close()
    }
  }

  // Append every line of the given file to values
  def readFiles(fs: FileSystem, source: String, values: ListBuffer[String]): ListBuffer[String] = {
    val sourcePath = new Path(source)
    val inputStream: FSDataInputStream = fs.open(sourcePath)
    val bufferedReader = new BufferedReader(new InputStreamReader(inputStream))

    try {
      var lineTxt: String = bufferedReader.readLine()
      while (lineTxt != null) {
        values.append(lineTxt)
        lineTxt = bufferedReader.readLine()
      }
      values
    } finally {
      bufferedReader.close()
      inputStream.close()
    }
  }
}
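For orientation, here is a minimal usage sketch that chains these helpers together, assuming the same package, user, and cluster settings as the apps below. The object name UtilsDemo is mine, not part of the original project.

package com.ruozedata.scalawroks

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

import scala.collection.mutable.ListBuffer

object UtilsDemo extends App {
  System.setProperty("HADOOP_USER_NAME", "hadoop")
  val fs = FileSystem.get(new URI("hdfs://192.168.137.130:9000/"), new Configuration())
  try {
    // List the files in the directory, then print every line of every file
    val files = ScalaToHDFSUtils.listFiles(fs, "/ruozedata/scala/", new ListBuffer[String]).toList
    val lines = new ListBuffer[String]
    for (f <- files) ScalaToHDFSUtils.readFiles(fs, f, lines)
    lines.foreach(println)
  } finally {
    fs.close()
  }
}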
2. File-moving class
package com.ruozedata.scalawroks

import java.io.IOException
import java.net.URI
import java.text.SimpleDateFormat
import java.util.Date

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._

import scala.collection.mutable.ListBuffer

object ScalaToHDFSApp extends App {
  val conf = new Configuration()
  var fs: FileSystem = null

  val sourcePath = "/ruozedata/"               // could be taken from args(0) when packaged as a jar
  val targetPath = "/ruozedata/scala/"         // could be taken from args(1) when packaged as a jar
  val uriPath = "hdfs://192.168.137.130:9000/" // could be taken from args(2) when packaged as a jar

  // New file name
  var newFileName = ""

  // File sequence number
  var fSeq = 1

  // Set the user for HDFS access
  System.setProperty("HADOOP_USER_NAME", "hadoop")

  try {
    fs = FileSystem.get(new URI(uriPath), conf)

    val holder: ListBuffer[String] = new ListBuffer[String]

    // List the files in the source directory (true = recurse into
    // subdirectories, false = current directory only; default: false)
    val paths: List[String] = ScalaToHDFSUtils.listFiles(fs, sourcePath, holder).toList

    for (path <- paths) {
      // Build the new file name
      newFileName = dateToStr() + fSeq.toString + ".txt"
      // Increment the sequence number
      fSeq += 1
      // Copy the file into the target directory
      ScalaToHDFSUtils.copyFile(fs, sourcePath + new Path(path).getName, targetPath + newFileName)
    }
  } catch {
    case e: IOException => e.printStackTrace()
  } finally {
    if (fs != null) {
      fs.close()
    }
  }

  // Current system time formatted as yyyyMMddHHmm, used as the file-name prefix
  def dateToStr(): String = {
    val date = new Date
    val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMddHHmm")
    dateFormat.format(date) + "-"
  }
}
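Each copied file is named with the timestamp prefix plus a running sequence number, which is exactly the pattern visible in the test data below (201802121449-1.txt and so on). A tiny standalone sketch of just the naming logic; FileNameDemo is a name made up for illustration:

import java.text.SimpleDateFormat
import java.util.Date

object FileNameDemo extends App {
  // Same format as dateToStr above: yyyyMMddHHmm plus a "-" separator
  val prefix = new SimpleDateFormat("yyyyMMddHHmm").format(new Date) + "-"
  (1 to 3).foreach(i => println(prefix + i + ".txt"))
  // prints e.g. 201802121449-1.txt, 201802121449-2.txt, 201802121449-3.txt
}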
3. Reading the files, computing the word count, and storing the result in MySQL
package com.ruozedata.scalawroks

import java.io.IOException
import java.net.URI
import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.collection.mutable.ListBuffer

object ScalacReadHDFSApp extends App {
  val conf = new Configuration()
  val sourcePath = "/ruozedata/scala/"         // could be taken from args(0) when packaged as a jar
  val uriPath = "hdfs://192.168.137.130:9000/" // could be taken from args(1) when packaged as a jar
  val holder: ListBuffer[String] = new ListBuffer[String]
  val readHolder: ListBuffer[String] = new ListBuffer[String]

  var fs: FileSystem = null
  var listBuffer: List[String] = null

  // Set the user for HDFS access
  System.setProperty("HADOOP_USER_NAME", "hadoop")

  try {
    fs = FileSystem.get(new URI(uriPath), conf)

    // List the files in the source directory (true = recurse into
    // subdirectories, false = current directory only; default: false)
    val paths: List[String] = ScalaToHDFSUtils.listFiles(fs, sourcePath, holder).toList

    // readHolder accumulates the lines of every file, so after the loop
    // listBuffer holds the contents of all of them
    for (path <- paths) {
      listBuffer = ScalaToHDFSUtils.readFiles(fs, sourcePath + new Path(path).getName, readHolder).toList
    }

    // The WordCount itself: split each line into words, pair each word with 1,
    // group by word, and sum the 1s. The test files are space-delimited, so we
    // split on runs of whitespace rather than on "\t".
    val wc = listBuffer.flatMap(_.split("\\s+")).map((_, 1)).groupBy(_._1).mapValues(_.map(_._2).reduce(_ + _))

    // Persist the result to MySQL
    insertMySQL(wc)
  } catch {
    case e: IOException => e.printStackTrace()
  } finally {
    if (fs != null) {
      fs.close()
    }
  }

  def insertMySQL(wc: Map[String, Int]): Unit = {
    // Connect to the "works" database on the MySQL server;
    // com.mysql.jdbc.Driver is registered automatically by JDBC 4+,
    // so no Class.forName call is needed
    val url = "jdbc:mysql://192.168.137.130:3306/works"
    val username = "root"
    val password = "123456"

    var connection: Connection = null

    try {
      connection = DriverManager.getConnection(url, username, password)
      // A prepared statement avoids quoting problems in concatenated SQL
      val ps: PreparedStatement = connection.prepareStatement(
        "insert into wordcounts(words, counts) values(?, ?)")

      for ((key, value) <- wc) {
        ps.setString(1, key)
        ps.setString(2, value.toString) // counts is varchar(20) in the table below
        ps.addBatch()
      }

      ps.executeBatch()
      ps.close()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (connection != null) {
        connection.close()
      }
    }
  }
}
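The single wc line is the whole WordCount. To make each step visible, here is the same pipeline run in memory on the test data from section 6; WordCountDemo is an illustrative name, not part of the original project.

object WordCountDemo extends App {
  // The four lines of the three test files from section 6
  val lines = List("hello world", "hello worls", "hello ruoze", "ruoze hello")

  val wc = lines
    .flatMap(_.split("\\s+"))             // all words: hello, world, hello, worls, ...
    .map((_, 1))                          // pair each word with a count of 1
    .groupBy(_._1)                        // group the pairs by word
    .mapValues(_.map(_._2).reduce(_ + _)) // sum the counts per word

  wc.foreach(println) // (hello,4), (world,1), (worls,1), (ruoze,2) in some order
}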
4. MySQL table creation
create table wordcounts(words varchar(50), counts varchar(20));
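To verify the inserted rows from Scala, a minimal sketch using the same connection settings as insertMySQL above; CheckResults is an illustrative name.

import java.sql.DriverManager

object CheckResults extends App {
  val conn = DriverManager.getConnection(
    "jdbc:mysql://192.168.137.130:3306/works", "root", "123456")
  try {
    val rs = conn.createStatement().executeQuery("select words, counts from wordcounts")
    while (rs.next()) {
      println(rs.getString("words") + "\t" + rs.getString("counts"))
    }
  } finally {
    conn.close()
  }
}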
5. Maven configuration (pom.xml)
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>testScala</groupId>
  <artifactId>testScala</artifactId>
  <version>1.0</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.11.8</scala.version>
    <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
    <spark.version>2.2.1</spark.version>
    <mysql.version>5.1.25</mysql.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
    <repository>
      <id>cloudera</id>
      <name>cloudera</name>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>${mysql.version}</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.5</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
    </plugins>
  </build>
  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>
6. Test data

hdfs dfs -ls /ruozedata/scala
Found 3 items
-rw-r--r-- 3 hadoop supergroup 24 2018-02-12 14:48 /ruozedata/scala/201802121449-1.txt
-rw-r--r-- 3 hadoop supergroup 12 2018-02-12 14:48 /ruozedata/scala/201802121449-2.txt
-rw-r--r-- 3 hadoop supergroup 12 2018-02-12 14:48 /ruozedata/scala/201802121449-3.txt
-----------------------------------------------------------------------------------------------------------
hdfs dfs -cat /ruozedata/scala/201802121449-1.txt
hello world
hello worls
hdfs dfs -cat /ruozedata/scala/201802121449-2.txt
hello ruoze
hdfs dfs -cat /ruozedata/scala/201802121449-3.txt
ruoze hello
7. Test results
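The original post showed the query output only as a screenshot, which has not survived. Working it out from the test data above (assuming words are split on whitespace), the wordcounts table should contain:

words   counts
hello   4
world   1
worls   1
ruoze   2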
Note: if you have any good suggestions, please share them.
From @若澤大資料
Source: ITPUB blog, http://blog.itpub.net/31511218/viewspace-2151064/. Please credit the source when reprinting.