畢業設計三:資料清洗

xyzlotus發表於2008-05-11

畢業設計三:資料清洗與轉換

 

1、執行清洗工作

// CleanSQL.txt

delete from weblog where status like'4%'or status like'5%';

delete from weblog where method like'%POST%';

delete from weblog where method like'%OPTIONS%';

delete from weblog where method like'%HEAD%';

delete from weblog where req_bytes like'-';

delete from weblog where url like'%gif%';

delete from weblog where url like'%jpg%';

delete from weblog where url like'%JPG';

delete from weblog where url like'%jpeg%';

delete from weblog where url like'%png';

delete from weblog where url like'%images%';

delete from weblog where url like'%image%';

delete from weblog where url like'%swf%';

delete from weblog where url like'%css%';

delete from weblog where url like'%ccs.css%';

delete from weblog where url='/';

delete from weblog where url='%pac%';

delete from weblog where url like'/stat.php?type=%id=%';

delete from weblog where url like'%php%';

delete from weblog where url like'%js';

delete from weblog where url like'%cgi';

delete from weblog where url like'';

delete from weblog where url like'%zip';

delete from weblog where url like'%rar';

delete from weblog where url like'%mp3';

delete from weblog where url like'%wma';

delete from weblog where url like'%wmv';

delete from weblog where url like'%doc';

delete from weblog where url like'%exe';

delete from weblog where url like'%pdf';

delete from weblog where url like'%ppt';

commit;

執行:SQL> start F:\畢業設計\DataPreparation\CleanSQL.txt

清洗結果:

SQL> select count(*) from weblog;

 

  COUNT(*)

----------

     94265

2、進行格式轉換與橫向縮簡

 update weblog

 set access_time=SUBSTR(access_time,2,20);

 comm.it;

 alter table weblog add(temptime date);

 

 update weblog

 set

 temptime=to_date(access_time,'dd/mm/yyyy hh24:mi:ss','NLS_DATE_LANGUAGE=english');

 commit;

 

 alter table weblog drop column access_time;

 

 alter table weblog add(access_time date);

 

 update weblog

 set access_time = temptime;

 

alter table weblog drop column temptime;

 

alter table weblog drop column ident;

alter table weblog drop column authorized_user;

alter table weblog drop column time_zone;

alter table weblog drop column method;

alter table weblog drop column protocol;

 

/**********************註釋開始***********************************/

//1)清洗資料,凡對挖掘結果沒有影響的記錄,全部刪除。

//HTM頁面,ASP頁面,類似圖片,音樂,SWF,指令碼,壓縮包等全部刪除

//狀態為400599的全部刪除

//方法為POST,HEAD,OPTIONS的全部刪除

//請求位元組數為0的全部刪除。

delete from weblog where status like'4%'or status like'5%';

 

delete from weblog where method like'%POST%';

delete from weblog where method like'%OPTIONS%';

delete from weblog where method like'%HEAD%';

 

delete from weblog where req_bytes like'-';

 

delete from weblog where url like'%gif%';

delete from weblog where url like'%jpg%';

delete from weblog where url like'%JPG';

delete from weblog where url like'%jpeg%';

delete from weblog where url like'%png';

delete from weblog where url like'%images%';

delete from weblog where url like'%image%';

 

delete from weblog where url like'%swf%';

delete from weblog where url like'%css%';

delete from weblog where url like'%ccs.css%';

delete from weblog where url='/';

delete from weblog where url='%pac%';

delete from weblog where url like'/stat.php?type=%id=%';

delete from weblog where url like'%php%';

 

delete from weblog where url like'%js';

delete from weblog where url like'%cgi';

delete from weblog where url like'';

 

delete from weblog where url like'%zip';

delete from weblog where url like'%rar';

 

delete from weblog where url like'%mp3';

delete from weblog where url like'%wma';

delete from weblog where url like'%wmv';

delete from weblog where url like'%doc';

delete from weblog where url like'%exe';

delete from weblog where url like'%pdf';

delete from weblog where url like'%ppt';

 

 

2)對錶進行格式轉換和橫向縮簡

//過濾掉時間列的“[”字元

 update weblog

 set access_time=SUBSTR(access_time,2,20);

//將時間由VARCHAR2轉換為DATE型別,便於後期處理。

 

//1、建立一個DATE型別的臨時欄位TEMPTIME

 alter table weblog add(temptime date);

//2、將舊欄位中的VARCHAR2型別轉換為時間格式,儲存在TEMPTIME

update weblog

set

temptime=to_date(access_time,'dd/mm/yyyy hh24:mi:ss','NLS_DATE_LANGUAGE=english');

commit;

//3、刪除舊欄位

 alter table weblog drop column access_time;

//4、增加DATE型別的新欄位

 alter table weblog add(access_time date);

//5、將臨時欄位中的值更新到新增加的DATE型別欄位中

 update weblog

 set access_time = temptime;

//6、刪除臨時欄位

alter table weblog drop column temptime;

 

//對錶進行橫向縮簡,刪除不需要的欄位

alter table weblog drop column ident;

alter table weblog drop column authorized_user;

alter table weblog drop column time_zone;

alter table weblog drop column method;

alter table weblog drop column protocol;

/****************************註釋結束********************************/

 

2、登入SQL*PLUS

輸入如下命令:

start F:\畢業設計\DataPreparation\CleanAndFormatSQL.txt

即可執行文字檔案裡邊的SQL語句。

 

3、資料清洗的結果

SQL> select count(*) from weblog;

 

  COUNT(*)

----------

     94265

 

SQL> desc weblog;

 名稱                                      是否為空? 型別

 ----------------------------------------- -------- -----------------------

 

 CIP                                                VARCHAR2(20)

 URL                                                VARCHAR2(500)

 STATUS                                             VARCHAR2(10)

 REQ_BYTES                                          VARCHAR2(100)

 ACCESS_TIME                                        DATE

來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/12443821/viewspace-265033/,如需轉載,請註明出處,否則將追究法律責任。

相關文章