import os

import easygui as g

import glob

import pandas as pd

import xml.etree.ElementTree as ET

from tqdm import tqdm

import pandas_profiling

# Ask the user for the image folder. Point ``default`` at the location of your
# own data to save some clicking in the dialog.
image_path = g.diropenbox(title=" 請選擇影像資料夾路徑 ", default=r"E:\python\ 標定資料清洗 \00001001-00001500image")
# diropenbox returns None when the dialog is cancelled; None is not a usable
# folder for the steps below, so stop with a clear message right away.
if image_path is None:
    raise SystemExit("No image folder selected; aborting.")
print(image_path)

# Ask the user for the folder holding the xml annotation files.
xml_path = g.diropenbox(title=" 請選擇 xml 資料夾路徑 ", default=r"E:\python\ 標定資料清洗 \00001001-00001500xml")
if xml_path is None:
    raise SystemExit("No xml folder selected; aborting.")
print(xml_path)

# Plain file names (no directory part) found in each folder.
image_lst = os.listdir(image_path)
xml_lst = os.listdir(xml_path)

print("image list:", len(image_lst))
print("xml list:", len(xml_lst))

print(" ————————功能 1 ：顯示命名不規劃的 xml 檔案——————————————————— ")

# Feature 1: list xml files whose name breaks the naming rule. The rule used
# here is "file name is exactly 12 characters long"; adapt it to your own
# naming convention.
err_xml = [entry for entry in xml_lst if len(entry) != 12]

if err_xml:
    # One offending file name per line.
    print(*err_xml, sep="\n")
else:
    print(" 無不規範命名的 xml 檔案 ")

print(" ————————功能 2 ：缺失 xml 檔案顯示—————————————————————————— ")

# Feature 2: report images that have no matching xml file.
# A set makes each membership test O(1) instead of rescanning the whole xml
# list for every image.
xml_names = set(xml_lst)

missing_xml = []
for image in tqdm(image_lst):
    # splitext strips the extension whatever its length (.jpg, .jpeg, ...).
    stem = os.path.splitext(image)[0]
    if stem + '.xml' not in xml_names:
        # Store the name without extension.
        missing_xml.append(stem)

print(" 缺失 xml 檔案數： ",len(missing_xml))
print(" 缺失 xml 檔案為： ",missing_xml)

print(" ————————功能 3 ：缺失影像顯示————————————————————————————— ")

# Feature 3: report xml files that have no matching .jpg image (in other
# words, surplus xml files).
# A set makes each membership test O(1) instead of rescanning the whole image
# list for every xml file.
image_names = set(image_lst)

missing_image = []
for xml in tqdm(xml_lst):
    # splitext strips the extension whatever its length.
    stem = os.path.splitext(xml)[0]
    if stem + '.jpg' not in image_names:
        # Store the name without extension.
        missing_image.append(stem)

print(" 缺失 image 檔案數： ", len(missing_image))
print(" 缺失 image 檔案為： ", missing_image)

print(" ————————功能 4 ：刪除沒有對應 xml 的圖片————————————————————— ")

# Feature 4: delete every image that has no xml file.
drop_list1 = []

# Loop over a copy so entries can be removed from missing_xml safely; removing
# from a list while iterating that same list skips elements.
for index1 in list(missing_xml):
    image = index1 + '.jpg'
    # os.path.join picks the right separator for the current platform.
    os.remove(os.path.join(image_path, image))
    # Only mark the entry as handled once the file is really gone.
    missing_xml.remove(index1)
    drop_list1.append(index1)

if drop_list1:
    print(" 成功刪除： ",drop_list1)
else:
    print(" 無缺失檔案 ")

print(" ————————功能 5 ：刪除沒有對應圖片的 xml 檔案—————————————————— ")

# Feature 5: delete every xml file that has no image.
drop_list2 = []

# Loop over a copy so entries can be removed from missing_image safely;
# removing from a list while iterating that same list skips elements.
for index2 in list(missing_image):
    xml = index2 + '.xml'
    # os.path.join picks the right separator for the current platform.
    os.remove(os.path.join(xml_path, xml))
    # Only mark the entry as handled once the file is really gone.
    missing_image.remove(index2)
    drop_list2.append(index2)

if drop_list2:
    print(" 成功刪除： ",drop_list2)
else:
    print(" 無缺失檔案 ")

print(" ————————功能 6 ：將 xml 檔案寫入 csv 檔案—————————————————————— ")

# Export the xml annotations to a csv-ready table so they can be analysed later.

def _child(parent, tag, index):
    """Return the child of *parent* named *tag*, or the child at *index* if no such tag exists."""
    node = parent.find(tag)
    # Compare with None explicitly: an Element without children is falsy.
    return parent[index] if node is None else node


def xml_to_csv(path):
    """Collect every ``object`` entry of the xml files in *path* into a DataFrame.

    Args:
        path: Directory that contains the ``*.xml`` annotation files.

    Returns:
        A ``pandas.DataFrame`` with one row per ``object`` element and the
        columns ``filename, width, height, class, xmin, ymin, xmax, ymax``.
        The frame is empty (but keeps its columns) when nothing is found.

    Raises:
        xml.etree.ElementTree.ParseError: If a file is not well-formed xml.
        AttributeError, IndexError, ValueError: If a file with ``object``
            entries lacks an expected element or holds a non-integer value.
    """
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    rows = []

    # os.path.join keeps the pattern valid on every platform; sorting makes the
    # row order reproducible between runs.
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        root = ET.parse(xml_file).getroot()

        for member in root.findall('object'):
            # Elements are looked up by tag name first and by their former
            # fixed position second, so files that omit optional children
            # (which shifts the positions) are still read correctly.
            size = root.find('size')
            box = _child(member, 'bndbox', 4)
            rows.append((
                root.find('filename').text,
                int(_child(size, 'width', 0).text),
                int(_child(size, 'height', 1).text),
                _child(member, 'name', 0).text,
                int(_child(box, 'xmin', 0).text),
                int(_child(box, 'ymin', 1).text),
                int(_child(box, 'xmax', 2).text),
                int(_child(box, 'ymax', 3).text),
            ))

    return pd.DataFrame(rows, columns=column_name)

# Feature 6 (run): convert the annotations and store them as labels.csv in the
# current working directory.
xml_df = xml_to_csv(xml_path)
# index=False keeps the DataFrame row index out of the csv file.
xml_df.to_csv('labels.csv', index=False)
print('Successfully converted xml to csv.')

# Header for feature 7. Advertising text that had been pasted into this
# string was removed so it matches the other feature headers.
print(" ————————功能 7 ：檢視 xml 檔案資訊，生成報告——————————————————— ")

def eda(in_file, out_file):
    """Build a pandas_profiling report for a csv file.

    Args:
        in_file: Path of the comma-separated file to read.
        out_file: Path the report is written to.
    """
    frame = pd.read_csv(in_file, sep=',')
    # Build the profile report and write it out in one go.
    pandas_profiling.ProfileReport(frame).to_file(out_file)

# Feature 7 (run): profile the csv written by feature 6 and store the report
# as an html file next to it.
in_file, out_file = 'labels.csv', 'labels.html'
eda(in_file=in_file, out_file=out_file)
print('eda done!')

print(" ————————功能 8 ：改寫 label 出錯的 xml 檔案———————————————————— ")

def main(path):
    """Fix misspelled class labels in the xml files found in *path*.

    Every ``object`` element of every ``*.xml`` file is inspected; its first
    child is treated as the class label. Known misspellings of
    ``chemical_vehicle`` and ``chemical_sign`` are corrected in place, and
    files that contain the label ``w`` are only reported. The three result
    lists are printed at the end.

    Args:
        path: Directory that contains the ``*.xml`` annotation files.

    Raises:
        xml.etree.ElementTree.ParseError: If a file is not well-formed xml.
    """
    # Misspellings that all stand for 'chemical_vehicle'.
    vehicle_typos = {'chemical_vehical', 'chemcial_vehicle', 'chemical_vehicel'}
    wrong_class_lst1, wrong_class_lst2, w_lst = [], [], []

    # os.path.join inserts the separator between folder and pattern; without
    # it the pattern "<folder>*.xml" never matches a file inside the folder.
    # Sorting makes the processing order reproducible.
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        print(xml_file)
        tree = ET.parse(xml_file)
        root = tree.getroot()
        changed = False

        for member in root.findall('object'):
            label = member[0]
            value = label.text
            if value in vehicle_typos:
                wrong_class_lst1.append(root.find('filename').text)
                label.text = 'chemical_vehicle'
                changed = True
            elif value == 'chemical_sigh':
                wrong_class_lst2.append(root.find('filename').text)
                label.text = 'chemical_sign'
                changed = True
            elif value == 'w':
                # Only reported: there is no known correct label for 'w'.
                w_lst.append(root.find('filename').text)

        # Leave untouched files alone instead of re-serialising them.
        if changed:
            tree.write(xml_file)

    print('wrong_class_list1:', wrong_class_lst1)
    # Print the second list here (the first list used to be printed twice).
    print('wrong_class_list2:', wrong_class_lst2)
    print('w_list:', w_lst)

# Feature 8 (run): apply the label clean-up to the selected xml folder.
main(xml_path)

print(" 完成！ ")

# Source article title: python對資料集進行清洗與視覺化 (Cleaning and visualizing a dataset with Python)

# 相關文章 (Related articles)