最近學了一個比較讚的電商專案,專案作者提供了完整的示例資料,包括商品資訊及配圖,但是這些配圖是固定的URL,商品詳情為html,html中有img標籤,img標籤中也有url。根據過往經驗這種線上CDN很容易掛掉,因此產生了把商品資料中的商品圖片提取出來,放在自己的騰訊雲伺服器中的想法,保證可訪問性。
演示資料
[{
"ID": "b93e59e214fc4478ac72652a2c87fe54",
"GOODS_SERIAL_NUMBER": "2300000059885",
"SHOP_ID": "402880e860166f3c0160167897d60002",
"SUB_ID": "402880e86016d1b5016016dcd7c50004",
"GOOD_TYPE": 1,
"STATE": 0,
"IS_DELETE": 1,
"NAME": "雲南紅提800g/盒",
"ORI_PRICE": 18,
"PRESENT_PRICE": 15,
"AMOUNT": 10000,
"DETAIL": "<img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_9395.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_3391.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_7603.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_4718.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_778.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_2602.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_7913.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_202.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_4296.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_6956.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_8200.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112031_3967.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112031_5114.jpg\" width=\"100%\" height=\"auto\" alt=\"\" />",
"BRIEF": null,
"SALES_COUNT": 0,
"IMAGE1": "http://images.koow.cc/shopGoodsImg/20171225/20171225112020_561.jpg",
"IMAGE2": null,
"IMAGE3": null,
"IMAGE4": null,
"IMAGE5": null,
"ORIGIN_PLACE": null,
"GOOD_SCENT": null,
"CREATE_TIME": 1514172047397,
"UPDATE_TIME": 1522037064430,
"IS_RECOMMEND": 0,
"PICTURE_COMPERSS_PATH": "http://images.koow.cc/compressedPic/20171225112020_561.jpg"
},
{
"ID": "e0ab2f6e2802443ba117b1146cf85fee",
"GOODS_SERIAL_NUMBER": "4894375014863",
"SHOP_ID": "402880e860166f3c0160167897d60002",
"SUB_ID": "2c9f6c94609a62be0160a02d1dc20021",
"GOOD_TYPE": 1,
"STATE": 0,
"IS_DELETE": 1,
"NAME": "菓子町園道乳酸菌味夾心餅乾(抹茶味)540/罐",
"ORI_PRICE": 29.8,
"PRESENT_PRICE": 29.8,
"AMOUNT": 10000,
"DETAIL": "<img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110655_230.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_329.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_2659.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_9521.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_8611.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_1390.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_7291.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_3919.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_2170.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_4402.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_1926.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_9438.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_4361.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_2730.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110658_314.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img 
src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110658_8779.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110658_9878.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110658_3471.jpg\" width=\"100%\" height=\"auto\" alt=\"\" />",
"BRIEF": null,
"SALES_COUNT": 0,
"IMAGE1": "http://images.koow.cc/shopGoodsImg/20180213/20180213110648_2744.jpg",
"IMAGE2": null,
"IMAGE3": null,
"IMAGE4": null,
"IMAGE5": null,
"ORIGIN_PLACE": null,
"GOOD_SCENT": null,
"CREATE_TIME": 1518491222336,
"UPDATE_TIME": 1523174099461,
"IS_RECOMMEND": 0,
"PICTURE_COMPERSS_PATH": "http://images.koow.cc/compressedPic/20180213110648_2744.jpg"
}]
可以看到,資料比較完整,包括ID、編號、名稱、價格、介紹等資訊。
如果想要提取JSON物件中的圖片URL,對於其中的IMAGE1-IMAGE5欄位比較好處理,只需要遍歷即可。對於DETAIL中的圖片URL,由於URL混在html中,沒有辦法直接拿到,可通過正則匹配的形式獲取。下面分步驟操作:
提取IMAGE1-IMAGE5中的圖片URL
const fs = require("fs");

// Extracts the IMAGE1..IMAGE5 URLs of every product in ./goods_demo.json and
// appends them to ./result.txt, one URL per line (each prefixed with \r\n).
fs.readFile("./goods_demo.json", "utf8", (err, data) => {
  // Stop early when the input cannot be read; `data` is undefined in that case
  // and JSON.parse would throw inside the callback.
  if (err) {
    console.log("讀檔案操作失敗");
    return;
  }

  // Deserialize the JSON text; the script expects an array of product records.
  let goods;
  try {
    goods = JSON.parse(data);
  } catch (parseErr) {
    console.log("解析JSON資料失敗");
    return;
  }
  if (!Array.isArray(goods)) {
    console.log("解析JSON資料失敗");
    return;
  }

  // Collect the URLs first so they can be written with a single append.
  // Issuing one asynchronous appendFile per URL gives no ordering guarantee.
  const urls = [];
  goods.forEach((value) => {
    if (value === null || typeof value !== "object") return;
    for (let i = 1; i <= 5; i++) {
      const url = value[`IMAGE${i}`];
      // Skip null, missing and empty fields so "undefined" is never written.
      if (typeof url === "string" && url !== "") urls.push(url);
    }
  });
  if (urls.length === 0) return;

  const lines = urls.map((url) => `\r\n${url}`).join("");
  fs.appendFile("./result.txt", lines, (writeErr) => {
    if (writeErr) console.log("寫檔案操作失敗");
    else console.log("寫檔案操作成功");
  });
});
使用NodeJS執行上面的程式碼後,就能夠正確的讀取到IMAGE物件中的URL,並寫入到result.txt檔案中。
提取DETAIL物件中的圖片URL
對url地址分析可以發現,圖片URL包括http開頭(part1),CDN的URL(part2),圖片所在的目錄(part3),圖片的名稱(part4):
"http://(part1)images.koow.cc(part2)/shopGoodsImg(part3)/20171225(part3)/20171225112020_561.jpg(part4)"
根據以上對URL結構的分析,可以用以下正規表示式進行匹配:
// Matches .jpg URLs hosted on images.koow.cc.
// - `\.` matches a literal dot; an unescaped `.` would match any character.
// - `\/` escapes the slash, which would otherwise end the regex literal.
// - `\w` matches any letter, digit or underscore.
// - `(?:\/\w+){2,5}` matches 2 to 5 path segments (directories plus file name).
// - The `g` flag makes the match global, so every URL in the input is found.
const urlReg = /http:\/\/images\.koow\.cc(?:\/\w+){2,5}\.jpg/g;
加上對JSON中DETAIL物件處理的程式碼以後,整體程式碼如下:
const fs = require("fs");

// Extracts every image URL of every product in ./goods_demo.json and appends
// them to ./result.txt, one URL per line (each prefixed with \r\n). For each
// product the URLs embedded in the DETAIL html come first, followed by the
// IMAGE1..IMAGE5 fields.
fs.readFile("./goods_demo.json", "utf8", (err, data) => {
  // Stop early when the input cannot be read; `data` is undefined in that case
  // and JSON.parse would throw inside the callback.
  if (err) {
    console.log("讀檔案操作失敗");
    return;
  }

  // Deserialize the JSON text; the script expects an array of product records.
  let goods;
  try {
    goods = JSON.parse(data);
  } catch (parseErr) {
    console.log("解析JSON資料失敗");
    return;
  }
  if (!Array.isArray(goods)) {
    console.log("解析JSON資料失敗");
    return;
  }

  // Image URL pattern, created once instead of once per record. The dots of
  // the host name are escaped so they only match a literal ".".
  const urlReg = /http:\/\/images\.koow\.cc(?:\/\w+){2,5}\.jpg/g;

  // Collect the URLs first so they can be written with a single append.
  // Issuing one asynchronous appendFile per URL gives no ordering guarantee.
  const urls = [];
  goods.forEach((value) => {
    if (value === null || typeof value !== "object") return;

    if (typeof value.DETAIL === "string") {
      // String.prototype.match with a global regex returns null when nothing
      // matches, hence the fallback to an empty list.
      const arrlist = value.DETAIL.match(urlReg) || [];
      arrlist.forEach((item) => urls.push(item));
    }

    for (let i = 1; i <= 5; i++) {
      const url = value[`IMAGE${i}`];
      // Skip null, missing and empty fields so "undefined" is never written.
      if (typeof url === "string" && url !== "") urls.push(url);
    }
  });
  if (urls.length === 0) return;

  const lines = urls.map((url) => `\r\n${url}`).join("");
  fs.appendFile("./result.txt", lines, (writeErr) => {
    if (writeErr) console.log("寫檔案操作失敗");
    else console.log("寫檔案操作成功");
  });
});
最終提取的url在result.txt中儲存,等待後續的處理。
http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_9395.jpg
http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_3391.jpg
http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_4718.jpg
http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_7603.jpg
……
批量下載
想要做私有的CDN伺服器,檔案的儲存路徑是不能變的,不然就匹配不到資料庫中儲存的路徑。如何在批量下載時保持圖片的目錄不變呢?很簡單,只需要使用wget命令:
wget -nc -r -i ./result.txt
-nc, --no-clobber 不要覆蓋已經存在的檔案
-r, --recursive 遞迴下載,下載所有檔案,並依照URL的路徑建立對應的目錄結構
-i, --input-file 下載指定檔案中的URL
總結
對JSON或XML資料執行處理是程式設計師的必備技能,掌握高效的資料處理方法能讓工作事半功倍,避免不必要的時間開銷。作者寫本文的目的是希望能幫助到有同樣需求的小夥伴,也希望電腦旁的你能把自己處理資料的技巧分享出來!