sqoop小作業

13545163656發表於2018-06-21
實現需求
1) city_info表存在MySQL 
2) product_info表存在MySQL
3) user_click資料匯入Hive
4) 三表的join 取TOP3(按區域進行分組)按天分割槽表
5) 查詢結果匯入到MySQL資料庫

最終的統計結果欄位如下:
product_id 商品ID
product_name 商品名稱
area   區域
click_count  點選數/訪問量
rank   排名
day   時間

-------------------------------------------------------------------------------------

一、建立ruozedatajob資料庫
create database ruozedata_job;


二、hive建立使用者點選表
drop table user_click;
create  table user_click(
user_id int,
session_id string,
action_time string,
city_id int,
product_id int
)
partitioned by (data string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- load 使用者點選量表到使用者點選表
load data local inpath '/home/hadoop/data/user_click.txt' overwrite into table user_click partition(data='2018-06-20');


三、匯入城市資訊表到hive
-- hive建立城市資訊表
drop table city_info;
create  table city_info
(
city_id int,
city_name string,
area string
)

-- MySQL匯入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table city_info -m 1 \
--mapreduce-job-name Hive_city_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table city_info \
--hive-overwrite \
--hive-import

四、匯入產品資訊表到hive

-- hive建立產品資訊表
drop table product_info;
create external table product_info(
product_id int,
product_name string,
extend_info string
);

-- MySQL匯入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table product_info -m 1 \
--mapreduce-job-name Hive_product_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table product_info \
--hive-import \
--hive-overwrite


五、最終的統計結果欄位如下:
product_id 商品ID
product_name 商品名稱
area   區域
click_count   點選數/訪問量
rank   排名
day   時間

-- SQL語句
use ruozedata_job;

  1. SELECT t2.*,current_date as day from
  2.   (
  3.       SELECT
  4.         t1.product_id,
  5.         t1.product_name,
  6.         t1.area,
  7.         t1.click_count,
  8.         row_number() over(PARTITION BY t1.AREA ORDER BY t1.click_count DESC) AS rank
  9.         FROM
  10.           (
  11.             SELECT
  12.               t.product_id ,
  13.               t.product_name,
  14.               t.area,
  15.               COUNT(t.session_id) click_count
  16.               FROM (
  17.                   SELECT
  18.                     uc.session_id,
  19.                     ci.area,
  20.                     pi.product_id,
  21.                     pi.product_name
  22.                   FROM user_click uc
  23.                     LEFT JOIN city_info ci ON uc.city_id=ci.city_id
  24.                     LEFT JOIN product_info pi ON uc.product_id=pi.product_id
  25.                     WHERE 1=1
  26.                     AND pi.product_name IS NOT NULL
  27.                     AND ci.area IS NOT NULL
  28.                     )t
  29.               GROUP BY t.area,t.product_name,t.product_id
  30.               ORDER BY t.AREA ,click_count DESC
  31.           )t1
  32.    )t2
  33.   WHERE t2.rank <=3
  
統計結果:
Total MapReduce CPU Time Spent: 7 seconds 400 msec
OK
7       product7        CC      39      1       2018-06-21
26      product26       CC      39      2       2018-06-21
70      product70       CC      38      3       2018-06-21
4       product4        EC      40      1       2018-06-21
96      product96       EC      32      2       2018-06-21
5       product5        EC      31      3       2018-06-21
40      product40       NC      16      1       2018-06-21
9       product9        NC      16      2       2018-06-21
5       product5        NC      13      3       2018-06-21
56      product56       NW      20      1       2018-06-21
67      product67       NW      20      2       2018-06-21
48      product48       NW      19      3       2018-06-21
38      product38       SC      35      1       2018-06-21
98      product98       SC      34      2       2018-06-21
88      product88       SC      34      3       2018-06-21
16      product16       SW      20      1       2018-06-21
60      product60       SW      19      2       2018-06-21
95      product95       SW      19      3       2018-06-21
Time taken: 57.804 seconds, Fetched: 18 row(s)




六、Hive建立複製表及表資料

  1. create table product_hot as
  2. SELECT t2.*,current_date as day from
  3.   (
  4.       SELECT
  5.         t1.product_id,
  6.         t1.product_name,
  7.         t1.area,
  8.         t1.click_count,
  9.         row_number() over(PARTITION BY t1.AREA ORDER BY t1.click_count DESC) AS rank
  10.         FROM
  11.           (
  12.             SELECT
  13.               t.product_id ,
  14.               t.product_name,
  15.               t.area,
  16.               COUNT(t.session_id) click_count
  17.               FROM (
  18.                   SELECT
  19.                     uc.session_id,
  20.                     ci.area,
  21.                     pi.product_id,
  22.                     pi.product_name
  23.                   FROM user_click uc
  24.                     LEFT JOIN city_info ci ON uc.city_id=ci.city_id
  25.                     LEFT JOIN product_info pi ON uc.product_id=pi.product_id
  26.                     WHERE 1=1
  27.                     AND pi.product_name IS NOT NULL
  28.                     AND ci.area IS NOT NULL
  29.                     )t
  30.               GROUP BY t.area,t.product_name,t.product_id
  31.               ORDER BY t.AREA ,click_count DESC
  32.           )t1
  33.    )t2
  34.   WHERE t2.rank <=3



  
七、HIve匯入到MySQL

-- MySQL建立表結構
CREATE TABLE product_hot
(
product_id  int ,
product_name  varchar(255),
area  varchar(255),
click_count  int,
rank int,
day varchar(255)
)

-- 匯入MySQL資料庫
sqoop export \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--mapreduce-job-name FromHDFSToMySQL3 \
--table product_hot \
-m 2 \
--export-dir /user/hive/warehouse/ruozedata_job.db/product_hot/* \
--fields-terminated-by '\001'


MySQL檢視結果:



來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/31441024/viewspace-2156457/,如需轉載,請註明出處,否則將追究法律責任。

相關文章