Writing a WordCount program in Scala: reading from multiple files and storing the result in a MySQL database

No long preamble here, let's get straight to the code.
1. Common utility class
package com.ruozedata.scalawroks

// Hide java.io.FileSystem so that FileSystem below unambiguously
// refers to the Hadoop class
import java.io.{FileSystem => _, _}

import org.apache.hadoop.fs._

import scala.collection.mutable.ListBuffer

object ScalaToHDFSUtils {

  def isDir(fs: FileSystem, dirName: String): Boolean = {
    fs.isDirectory(new Path(dirName))
  }

  def isDir(fs: FileSystem, dirName: Path): Boolean = {
    fs.isDirectory(dirName)
  }

  def isFile(fs: FileSystem, fileName: String): Boolean = {
    fs.isFile(new Path(fileName))
  }

  def isFile(fs: FileSystem, fileName: Path): Boolean = {
    fs.isFile(fileName)
  }

  def createFile(fs: FileSystem, fileName: String): Boolean = {
    fs.createNewFile(new Path(fileName))
  }

  def createFile(fs: FileSystem, name: Path): Boolean = {
    fs.createNewFile(name)
  }

  def createFolder(fs: FileSystem, folderName: String): Boolean = {
    fs.mkdirs(new Path(folderName))
  }

  def createFolder(fs: FileSystem, folderName: Path): Boolean = {
    fs.mkdirs(folderName)
  }

  def exists(fs: FileSystem, name: String): Boolean = {
    fs.exists(new Path(name))
  }

  def exists(fs: FileSystem, name: Path): Boolean = {
    fs.exists(name)
  }

  def rename(fs: FileSystem, srcPath: String, distPath: String): Boolean = {
    fs.rename(new Path(srcPath), new Path(distPath))
  }

  // On HDFS a move is just a rename
  def move(fs: FileSystem, srcPath: String, distPath: String): Boolean = {
    fs.rename(new Path(srcPath), new Path(distPath))
  }

  // Accept-all filter; narrow the predicate to restrict which paths are listed
  class MyPathFilter extends PathFilter {
    override def accept(path: Path): Boolean = true
  }

  // List the files under fullName into holder. When flgFile is true, recurse
  // into subdirectories; when false (the default), list only the given directory.
  def listFiles(fs: FileSystem, fullName: String, holder: ListBuffer[String],
                flgFile: Boolean = false): ListBuffer[String] = {
    val filesStatus = fs.listStatus(new Path(fullName), new MyPathFilter)

    for (status <- filesStatus) {
      val filePath: Path = status.getPath
      if (isFile(fs, filePath))
        holder += filePath.toString
      else if (flgFile)
        listFiles(fs, filePath.toString, holder, flgFile) // propagate the flag when recursing
    }
    holder
  }

  def copyFile(fs: FileSystem, source: String, target: String): Unit = {
    val sourcePath = new Path(source)
    val targetPath = new Path(target)

    if (!exists(fs, targetPath))
      createFile(fs, targetPath)

    val inputStream: FSDataInputStream = fs.open(sourcePath)
    val outputStream: FSDataOutputStream = fs.create(targetPath)
    transport(inputStream, outputStream)
  }

  def transport(inputStream: InputStream, outputStream: OutputStream): Unit = {
    val buffer = new Array[Byte](64 * 1000)
    try {
      // read inside the try so a failure on the first read still closes the streams
      var len = inputStream.read(buffer)
      while (len != -1) {
        outputStream.write(buffer, 0, len)
        len = inputStream.read(buffer)
      }
      outputStream.flush()
    } catch {
      case e: IOException => e.printStackTrace()
    } finally {
      inputStream.close()
      outputStream.close()
    }
  }

  // Append every line of the given file to values
  def readFiles(fs: FileSystem, source: String, values: ListBuffer[String]): ListBuffer[String] = {
    val sourcePath = new Path(source)
    val inputStream: FSDataInputStream = fs.open(sourcePath)
    val bufferedReader = new BufferedReader(new InputStreamReader(inputStream))

    try {
      var lineTxt: String = bufferedReader.readLine()
      while (lineTxt != null) {
        values.append(lineTxt)
        lineTxt = bufferedReader.readLine()
      }
      values
    } finally {
      bufferedReader.close()
      inputStream.close()
    }
  }
}
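For orientation, here is a minimal usage sketch that chains these helpers together, assuming the same package, user, and cluster settings as the apps below. The object name UtilsDemo is mine, not part of the original project.

package com.ruozedata.scalawroks

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

import scala.collection.mutable.ListBuffer

object UtilsDemo extends App {
  System.setProperty("HADOOP_USER_NAME", "hadoop")
  val fs = FileSystem.get(new URI("hdfs://192.168.137.130:9000/"), new Configuration())
  try {
    // List the files in the directory, then print every line of every file
    val files = ScalaToHDFSUtils.listFiles(fs, "/ruozedata/scala/", new ListBuffer[String]).toList
    val lines = new ListBuffer[String]
    for (f <- files) ScalaToHDFSUtils.readFiles(fs, f, lines)
    lines.foreach(println)
  } finally {
    fs.close()
  }
}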
2. File-moving class
package com.ruozedata.scalawroks

import java.io.IOException
import java.net.URI
import java.text.SimpleDateFormat
import java.util.Date

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._

import scala.collection.mutable.ListBuffer

object ScalaToHDFSApp extends App {
  val conf = new Configuration()
  var fs: FileSystem = null

  val sourcePath = "/ruozedata/"               // could be taken from args(0) when packaged as a jar
  val targetPath = "/ruozedata/scala/"         // could be taken from args(1) when packaged as a jar
  val uriPath = "hdfs://192.168.137.130:9000/" // could be taken from args(2) when packaged as a jar

  // New file name
  var newFileName = ""

  // File sequence number
  var fSeq = 1

  // Set the user for HDFS access
  System.setProperty("HADOOP_USER_NAME", "hadoop")

  try {
    fs = FileSystem.get(new URI(uriPath), conf)

    val holder: ListBuffer[String] = new ListBuffer[String]

    // List the files in the source directory (true = recurse into
    // subdirectories, false = current directory only; default: false)
    val paths: List[String] = ScalaToHDFSUtils.listFiles(fs, sourcePath, holder).toList

    for (path <- paths) {
      // Build the new file name
      newFileName = dateToStr() + fSeq.toString + ".txt"
      // Increment the sequence number
      fSeq += 1
      // Copy the file into the target directory
      ScalaToHDFSUtils.copyFile(fs, sourcePath + new Path(path).getName, targetPath + newFileName)
    }
  } catch {
    case e: IOException => e.printStackTrace()
  } finally {
    if (fs != null) {
      fs.close()
    }
  }

  // Current system time formatted as yyyyMMddHHmm, used as the file-name prefix
  def dateToStr(): String = {
    val date = new Date
    val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMddHHmm")
    dateFormat.format(date) + "-"
  }
}
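Each copied file is named with the timestamp prefix plus a running sequence number, which is exactly the pattern visible in the test data below (201802121449-1.txt and so on). A tiny standalone sketch of just the naming logic; FileNameDemo is a name made up for illustration:

import java.text.SimpleDateFormat
import java.util.Date

object FileNameDemo extends App {
  // Same format as dateToStr above: yyyyMMddHHmm plus a "-" separator
  val prefix = new SimpleDateFormat("yyyyMMddHHmm").format(new Date) + "-"
  (1 to 3).foreach(i => println(prefix + i + ".txt"))
  // prints e.g. 201802121449-1.txt, 201802121449-2.txt, 201802121449-3.txt
}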
3. Reading the files, computing the word count, and storing the result in MySQL
package com.ruozedata.scalawroks

import java.io.IOException
import java.net.URI
import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.collection.mutable.ListBuffer

object ScalacReadHDFSApp extends App {
  val conf = new Configuration()
  val sourcePath = "/ruozedata/scala/"         // could be taken from args(0) when packaged as a jar
  val uriPath = "hdfs://192.168.137.130:9000/" // could be taken from args(1) when packaged as a jar
  val holder: ListBuffer[String] = new ListBuffer[String]
  val readHolder: ListBuffer[String] = new ListBuffer[String]

  var fs: FileSystem = null
  var listBuffer: List[String] = null

  // Set the user for HDFS access
  System.setProperty("HADOOP_USER_NAME", "hadoop")

  try {
    fs = FileSystem.get(new URI(uriPath), conf)

    // List the files in the source directory (true = recurse into
    // subdirectories, false = current directory only; default: false)
    val paths: List[String] = ScalaToHDFSUtils.listFiles(fs, sourcePath, holder).toList

    // readHolder accumulates the lines of every file, so after the loop
    // listBuffer holds the contents of all of them
    for (path <- paths) {
      listBuffer = ScalaToHDFSUtils.readFiles(fs, sourcePath + new Path(path).getName, readHolder).toList
    }

    // The WordCount itself: split each line into words, pair each word with 1,
    // group by word, and sum the 1s. The test files are space-delimited, so we
    // split on runs of whitespace rather than on "\t".
    val wc = listBuffer.flatMap(_.split("\\s+")).map((_, 1)).groupBy(_._1).mapValues(_.map(_._2).reduce(_ + _))

    // Persist the result to MySQL
    insertMySQL(wc)
  } catch {
    case e: IOException => e.printStackTrace()
  } finally {
    if (fs != null) {
      fs.close()
    }
  }

  def insertMySQL(wc: Map[String, Int]): Unit = {
    // Connect to the "works" database on the MySQL server;
    // com.mysql.jdbc.Driver is registered automatically by JDBC 4+,
    // so no Class.forName call is needed
    val url = "jdbc:mysql://192.168.137.130:3306/works"
    val username = "root"
    val password = "123456"

    var connection: Connection = null

    try {
      connection = DriverManager.getConnection(url, username, password)
      // A prepared statement avoids quoting problems in concatenated SQL
      val ps: PreparedStatement = connection.prepareStatement(
        "insert into wordcounts(words, counts) values(?, ?)")

      for ((key, value) <- wc) {
        ps.setString(1, key)
        ps.setString(2, value.toString) // counts is varchar(20) in the table below
        ps.addBatch()
      }

      ps.executeBatch()
      ps.close()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (connection != null) {
        connection.close()
      }
    }
  }
}
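The single wc line is the whole WordCount. To make each step visible, here is the same pipeline run in memory on the test data from section 6; WordCountDemo is an illustrative name, not part of the original project.

object WordCountDemo extends App {
  // The four lines of the three test files from section 6
  val lines = List("hello world", "hello worls", "hello ruoze", "ruoze hello")

  val wc = lines
    .flatMap(_.split("\\s+"))             // all words: hello, world, hello, worls, ...
    .map((_, 1))                          // pair each word with a count of 1
    .groupBy(_._1)                        // group the pairs by word
    .mapValues(_.map(_._2).reduce(_ + _)) // sum the counts per word

  wc.foreach(println) // (hello,4), (world,1), (worls,1), (ruoze,2) in some order
}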
4. MySQL table creation
create table wordcounts(words varchar(50), counts varchar(20));
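To verify the inserted rows from Scala, a minimal sketch using the same connection settings as insertMySQL above; CheckResults is an illustrative name.

import java.sql.DriverManager

object CheckResults extends App {
  val conn = DriverManager.getConnection(
    "jdbc:mysql://192.168.137.130:3306/works", "root", "123456")
  try {
    val rs = conn.createStatement().executeQuery("select words, counts from wordcounts")
    while (rs.next()) {
      println(rs.getString("words") + "\t" + rs.getString("counts"))
    }
  } finally {
    conn.close()
  }
}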
5. Maven configuration (pom.xml)
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>testScala</groupId>
  <artifactId>testScala</artifactId>
  <version>1.0</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.11.8</scala.version>
    <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
    <spark.version>2.2.1</spark.version>
    <mysql.version>5.1.25</mysql.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
    <repository>
      <id>cloudera</id>
      <name>cloudera</name>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>${mysql.version}</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.5</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
    </plugins>
  </build>
  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>
6. Test data

hdfs dfs -ls /ruozedata/scala
Found 3 items
-rw-r--r-- 3 hadoop supergroup 24 2018-02-12 14:48 /ruozedata/scala/201802121449-1.txt
-rw-r--r-- 3 hadoop supergroup 12 2018-02-12 14:48 /ruozedata/scala/201802121449-2.txt
-rw-r--r-- 3 hadoop supergroup 12 2018-02-12 14:48 /ruozedata/scala/201802121449-3.txt
-----------------------------------------------------------------------------------------------------------
hdfs dfs -cat /ruozedata/scala/201802121449-1.txt
hello world
hello worls
hdfs dfs -cat /ruozedata/scala/201802121449-2.txt
hello ruoze
hdfs dfs -cat /ruozedata/scala/201802121449-3.txt
ruoze hello
7. Test results
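The original post showed the query output only as a screenshot, which has not survived. Working it out from the test data above (assuming words are split on whitespace), the wordcounts table should contain:

words   counts
hello   4
world   1
worls   1
ruoze   2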
Note: if you have any good suggestions, please share them.
From @若澤大資料
Source: ITPUB blog, http://blog.itpub.net/31511218/viewspace-2151064/. Please credit the source when reprinting.