Python 對資料集進行清洗與視覺化

專注的阿熊發表於2021-03-25

import os

import easygui as g

import glob

import pandas as pd

import xml.etree.ElementTree as ET

from tqdm import tqdm

import pandas_profiling

# Ask the user for the image folder. `default` only pre-selects a starting
# location in the dialog; point it at your own data to save some clicks.
image_path = g.diropenbox(title=" 請選擇影像資料夾路徑 ", default=r"E:\python\ 標定資料清洗 \00001001-00001500image")
# diropenbox returns None when the dialog is cancelled; os.listdir(None) would
# then list the current working directory, so stop here instead of continuing
# with the wrong folder.
if image_path is None:
    raise SystemExit("No image folder selected; aborting.")
print(image_path)

# Same dialog for the folder that holds the XML annotations.
xml_path = g.diropenbox(title=" 請選擇 xml 資料夾路徑 ", default=r"E:\python\ 標定資料清洗 \00001001-00001500xml")
if xml_path is None:
    raise SystemExit("No XML folder selected; aborting.")
print(xml_path)

# Plain file names (no directory part) found in each folder.
image_lst = os.listdir(image_path)
xml_lst = os.listdir(xml_path)

print("image list:", len(image_lst))
print("xml list:", len(xml_lst))

print(" ————————功能 1 :顯示命名不規劃的 xml 檔案——————————————————— ")

# Feature 1: report XML files whose name breaks the naming convention.
# The convention checked here is a file name of exactly 12 characters
# (adjust the length to your own naming scheme).
err_xml = [xml_name for xml_name in xml_lst if len(xml_name) != 12]

for xml_name in err_xml:
    print(xml_name)

if not err_xml:
    print(" 無不規範命名的 xml 檔案 ")

print(" ————————功能 2 :缺失 xml 檔案顯示—————————————————————————— ")

# Feature 2: list images that have no annotation file.
# Build the lookup set once so each membership test is O(1) instead of a
# scan over the whole XML list.
xml_names = set(xml_lst)

# Collects the extension-less names of images without a matching XML file.
missing_xml = []
for image in tqdm(image_lst):
    # splitext strips the real extension, whatever its length, instead of
    # assuming it is always exactly four characters (".jpg").
    stem = os.path.splitext(image)[0]
    if stem + '.xml' not in xml_names:
        missing_xml.append(stem)

print(" 缺失 xml 檔案數: ", len(missing_xml))
print(" 缺失 xml 檔案為: ", missing_xml)

print(" ————————功能 3 :缺失影像顯示————————————————————————————— ")

# Feature 3: list annotation files that have no image (surplus XML files).
# Build the lookup set once so each membership test is O(1) instead of a
# scan over the whole image list.
image_names = set(image_lst)

# Collects the extension-less names of XML files without a matching image.
missing_image = []
for xml in tqdm(xml_lst):
    stem = os.path.splitext(xml)[0]
    # Images are expected to be stored as "<stem>.jpg".
    if stem + '.jpg' not in image_names:
        missing_image.append(stem)

print(" 缺失 image 檔案數: ", len(missing_image))
print(" 缺失 image 檔案為: ", missing_image)

print(" ————————功能 4 :刪除沒有對應 xml 的圖片————————————————————— ")

# Feature 4: delete every image that has no annotation file.
# The list is walked once without being modified during iteration and is
# emptied afterwards, so no entry is skipped and no outer retry loop is needed.
drop_list1 = []
for stem in missing_xml:
    # os.path.join uses the platform's separator instead of a hard-coded "\\".
    os.remove(os.path.join(image_path, stem + '.jpg'))
    drop_list1.append(stem)
# Every listed image has been deleted, so nothing is missing any more.
missing_xml.clear()

if drop_list1:
    print(" 成功刪除: ", drop_list1)
else:
    print(" 無缺失檔案 ")

print(" ————————功能 5 :刪除沒有對應圖片的 xml 檔案—————————————————— ")

# Feature 5: delete every annotation file that has no image.
# The list is walked once without being modified during iteration and is
# emptied afterwards, so no entry is skipped and no outer retry loop is needed.
drop_list2 = []
for stem in missing_image:
    # os.path.join uses the platform's separator instead of a hard-coded "\\".
    os.remove(os.path.join(xml_path, stem + '.xml'))
    drop_list2.append(stem)
# Every listed XML file has been deleted, so nothing is missing any more.
missing_image.clear()

if drop_list2:
    print(" 成功刪除: ", drop_list2)
else:
    print(" 無缺失檔案 ")

print(" ————————功能 6 :將 xml 檔案寫入 csv 檔案—————————————————————— ")


def _voc_child(parent, tag, position):
    """Return the child of *parent* named *tag*, else the child at *position*."""
    node = parent.find(tag)
    # Compare with None explicitly: an Element without children is falsy.
    return parent[position] if node is None else node


def xml_to_csv(path):
    """Collect every annotated object under *path* into one DataFrame.

    Each ``*.xml`` file directly inside *path* is parsed and every
    ``<object>`` element becomes one row, which makes later analysis of the
    labels easier.

    Args:
        path: Directory that contains the annotation XML files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename``, ``width``,
        ``height``, ``class``, ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
        It has no rows when no file contains an ``<object>`` element.
    """
    xml_list = []

    # os.path.join keeps the pattern valid on every platform (a literal "\\"
    # only works on Windows); sorting makes the row order reproducible.
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        root = ET.parse(xml_file).getroot()

        for member in root.findall('object'):
            # Fields are looked up by tag name so that objects lacking the
            # optional children are still read correctly; the original
            # positional layout is kept as a fallback.
            size = root.find('size')
            box = _voc_child(member, 'bndbox', 4)
            xml_list.append((
                root.find('filename').text,
                int(_voc_child(size, 'width', 0).text),
                int(_voc_child(size, 'height', 1).text),
                _voc_child(member, 'name', 0).text,
                int(_voc_child(box, 'xmin', 0).text),
                int(_voc_child(box, 'ymin', 1).text),
                int(_voc_child(box, 'xmax', 2).text),
                int(_voc_child(box, 'ymax', 3).text),
            ))

    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

# Flatten all annotations of the selected folder into one table.
xml_df = xml_to_csv(xml_path)

# index=False states the intent explicitly (the parameter is a bool): the row
# index carries no information and is left out of the CSV file.
xml_df.to_csv('labels.csv', index=False)

print('Successfully converted xml to csv.')

# The banner now matches the other feature banners: the unrelated advertising
# text that had been injected into the string was removed.
print(" ————————功能 7 :檢視 xml 檔案資訊,生成報告——————————————————— ")


def eda(in_file, out_file):
    """Generate a profiling report for a CSV file.

    Args:
        in_file: Path of the comma-separated file to read.
        out_file: Path the report produced by ``pandas_profiling`` is
            written to.
    """
    data = pd.read_csv(in_file, sep=',')
    pfr = pandas_profiling.ProfileReport(data)
    pfr.to_file(out_file)


# Profile the label table written by the previous step.
in_file = 'labels.csv'
out_file = 'labels.html'

eda(in_file, out_file)

print('eda done!')

print(" ————————功能 8 :改寫 label 出錯的 xml 檔案———————————————————— ")


def main(path):
    """Correct known misspelled class labels in the XML files under *path*.

    Every ``*.xml`` file directly inside *path* is parsed. Objects labelled
    with a known misspelling of ``chemical_vehicle`` or ``chemical_sign`` are
    rewritten with the correct label, and objects labelled ``w`` are only
    reported. A file is written back only when one of its labels changed.
    The names of the affected files are printed at the end.

    Args:
        path: Directory that contains the annotation XML files.
    """
    wrong_class_lst1, wrong_class_lst2, w_lst = [], [], []

    # misspelled label -> (corrected label, list recording the affected file)
    corrections = {
        'chemical_vehical': ('chemical_vehicle', wrong_class_lst1),
        'chemcial_vehicle': ('chemical_vehicle', wrong_class_lst1),
        'chemical_vehicel': ('chemical_vehicle', wrong_class_lst1),
        'chemical_sigh': ('chemical_sign', wrong_class_lst2),
    }

    # os.path.join inserts the separator that plain concatenation omitted;
    # without it the pattern never matched the files inside the folder.
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        print(xml_file)
        tree = ET.parse(xml_file)
        root = tree.getroot()
        modified = False

        for member in root.findall('object'):
            # Prefer the <name> child; fall back to the first child, which is
            # where the label was read from before.
            label = member.find('name')
            if label is None:
                label = member[0]
            value = label.text

            if value in corrections:
                fixed, affected = corrections[value]
                affected.append(root.find('filename').text)
                label.text = fixed
                modified = True
            elif value == 'w':
                w_lst.append(root.find('filename').text)

        # Leave untouched files byte-for-byte as they were.
        if modified:
            tree.write(xml_file)

    print('wrong_class_list1:', wrong_class_lst1)
    # Report the second list here (the first list used to be printed twice).
    print('wrong_class_list2:', wrong_class_lst2)
    print('w_list:', w_lst)

# Run the label correction (feature 8) on the XML folder selected earlier.
main(xml_path)

# Final message once every step of the script has finished.
print(" 完成! ")


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69946337/viewspace-2764944/,如需轉載,請註明出處,否則將追究法律責任。

相關文章