非同步2

Ding-yixia發表於2024-07-19
Optimizing code execution speed can involve various strategies, such as improving I/O operations, optimizing the image processing logic, and leveraging parallel processing more effectively. Below are some possible optimizations for the code you provided:

1. **Database Batch Insertion**: Instead of executing individual SQL insert statements, batch insertions could speed up the database operations.
2. **Image Processing**: Optimize the `count_black_pixels_in_sectors` function. For example, we can reduce the complexity of the sector calculation.
3. **Parallel Processing**: Use a thread pool to manage parallel tasks more efficiently instead of spawning new threads or futures for each task.
4. **Filesystem Operations**: Reduce I/O operations when checking for file existence and intersections.

Here is the optimized code:

```cpp
#include <algorithm>
#include <chrono>
#include <cmath>
#include <exception>
#include <filesystem>
#include <future>
#include <iostream>
#include <iterator>
#include <mutex>
#include <set>
#include <stdexcept>
#include <string>
#include <thread>
#include <tuple>
#include <vector>

#include <opencv2/core/utils/logger.hpp>
#include <opencv2/opencv.hpp>
#include <sqlite3.h>

namespace fs = std::filesystem;

/**
 * @brief Creates the four result tables (if missing) and stores one comparison run.
 *
 * All rows are written inside a single transaction. Values are bound to
 * prepared statements instead of being spliced into the SQL text, so paths
 * containing quotes (or any other SQL metacharacter) are stored verbatim and
 * similarity values keep full double precision.
 *
 * Failures are reported on std::cerr. A failed row does not abort the run:
 * the remaining rows are still attempted and the transaction is committed,
 * matching the best-effort behaviour of the string-built version.
 *
 * @param db                    Open SQLite connection; a null handle is reported and ignored.
 * @param high_similarity_pairs (path_A, path_B, similarity) rows for high_similarity_pairs.
 * @param low_similarity_pairs  (path_A, path_B, similarity) rows for low_similarity_pairs.
 * @param unmatched_files_A     Paths stored in unmatched_files_A.
 * @param unmatched_files_B     Paths stored in unmatched_files_B.
 */
void create_tables_and_insert_data(sqlite3* db, 
    const std::vector<std::tuple<std::string, std::string, double>>& high_similarity_pairs,
    const std::vector<std::tuple<std::string, std::string, double>>& low_similarity_pairs,
    const std::vector<std::string>& unmatched_files_A,
    const std::vector<std::string>& unmatched_files_B) {

    if (db == nullptr) {
        std::cerr << "SQL error: database handle is null" << std::endl;
        return;
    }

    // Runs a parameterless statement; reports the error and frees its message on failure.
    const auto exec = [db](const char* statement) -> bool {
        char* err_msg = nullptr;
        if (sqlite3_exec(db, statement, nullptr, nullptr, &err_msg) != SQLITE_OK) {
            std::cerr << "SQL error: " << (err_msg != nullptr ? err_msg : sqlite3_errmsg(db)) << std::endl;
            sqlite3_free(err_msg);
            return false;
        }
        return true;
    };

    static constexpr const char* kSchema = R"(
        CREATE TABLE IF NOT EXISTS high_similarity_pairs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path_A TEXT,
            path_B TEXT,
            similarity REAL
        );
        CREATE TABLE IF NOT EXISTS low_similarity_pairs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path_A TEXT,
            path_B TEXT,
            similarity REAL
        );
        CREATE TABLE IF NOT EXISTS unmatched_files_A (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path TEXT
        );
        CREATE TABLE IF NOT EXISTS unmatched_files_B (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            path TEXT
        );
    )";

    if (!exec(kSchema)) {
        return;
    }
    if (!exec("BEGIN TRANSACTION;")) {
        return;
    }

    // Binds a string without copying it; the source rows outlive every step() call.
    const auto bind_text = [](sqlite3_stmt* stmt, int index, const std::string& text) -> bool {
        return sqlite3_bind_text(stmt, index, text.c_str(), static_cast<int>(text.size()), SQLITE_STATIC) == SQLITE_OK;
    };

    // Prepares `insert_sql` once and executes it for every row, using
    // `bind_row` to attach that row's values to the statement.
    const auto insert_rows = [db](const char* insert_sql, const auto& rows, const auto& bind_row) {
        sqlite3_stmt* stmt = nullptr;
        if (sqlite3_prepare_v2(db, insert_sql, -1, &stmt, nullptr) != SQLITE_OK) {
            std::cerr << "SQL error: " << sqlite3_errmsg(db) << std::endl;
            sqlite3_finalize(stmt);  // harmless on a null statement
            return;
        }
        for (const auto& row : rows) {
            if (!bind_row(stmt, row) || sqlite3_step(stmt) != SQLITE_DONE) {
                std::cerr << "SQL error: " << sqlite3_errmsg(db) << std::endl;
            }
            sqlite3_reset(stmt);
            sqlite3_clear_bindings(stmt);
        }
        sqlite3_finalize(stmt);
    };

    const auto bind_pair = [&bind_text](sqlite3_stmt* stmt,
                                        const std::tuple<std::string, std::string, double>& pair) -> bool {
        return bind_text(stmt, 1, std::get<0>(pair))
            && bind_text(stmt, 2, std::get<1>(pair))
            && sqlite3_bind_double(stmt, 3, std::get<2>(pair)) == SQLITE_OK;
    };
    const auto bind_path = [&bind_text](sqlite3_stmt* stmt, const std::string& path) -> bool {
        return bind_text(stmt, 1, path);
    };

    insert_rows("INSERT INTO high_similarity_pairs (path_A, path_B, similarity) VALUES (?, ?, ?);",
                high_similarity_pairs, bind_pair);
    insert_rows("INSERT INTO low_similarity_pairs (path_A, path_B, similarity) VALUES (?, ?, ?);",
                low_similarity_pairs, bind_pair);
    insert_rows("INSERT INTO unmatched_files_A (path) VALUES (?);", unmatched_files_A, bind_path);
    insert_rows("INSERT INTO unmatched_files_B (path) VALUES (?);", unmatched_files_B, bind_path);

    // END TRANSACTION is SQLite's alias for COMMIT; roll back if it fails so
    // the connection is not left inside an open transaction.
    if (!exec("END TRANSACTION;")) {
        exec("ROLLBACK;");
    }
}

/**
 * @brief Counts thresholded pixels per angular sector around the image centre.
 *
 * The image is thresholded at 127 with cv::THRESH_BINARY_INV and every pixel
 * whose thresholded value is 0 is assigned to one of `sector` equal angular
 * slices, measured with atan2 around (cols / 2, rows / 2).
 *
 * NOTE(review): with THRESH_BINARY_INV, source pixels brighter than 127 become
 * 0, so the loop below counts the *bright* source pixels despite the function
 * name. Behaviour is kept as-is; confirm whether dark pixels were intended.
 *
 * @param image  Input image; assumed to be single-channel 8-bit (as produced
 *               by cv::imread with IMREAD_GRAYSCALE) — TODO confirm for other callers.
 * @param sector Number of angular sectors; must be positive.
 * @return One count per sector (all zeros for an empty image).
 * @throws std::invalid_argument if `sector` is not positive.
 */
std::vector<int> count_black_pixels_in_sectors(const cv::Mat& image, int sector) {
    if (sector <= 0) {
        throw std::invalid_argument("sector must be positive");
    }

    std::vector<int> black_pixel_counts(sector, 0);
    if (image.empty()) {
        // Nothing to threshold; an unreadable image contributes no pixels.
        return black_pixel_counts;
    }

    cv::Mat binary_image;
    cv::threshold(image, binary_image, 127, 255, cv::THRESH_BINARY_INV);

    const int height = binary_image.rows;
    const int width = binary_image.cols;
    const int center_x = width / 2;
    const int center_y = height / 2;
    const double angle_step = 360.0 / sector;
    // Sector width in radians, hoisted out of the per-pixel loop.
    const double sector_width_rad = angle_step * CV_PI / 180.0;

    for (int y = 0; y < height; ++y) {
        const uchar* row = binary_image.ptr<uchar>(y);
        const double dy = y - center_y;
        for (int x = 0; x < width; ++x) {
            if (row[x] != 0) {
                continue;
            }
            const double dx = x - center_x;
            double angle = std::atan2(dy, dx);  // (-pi, pi]
            if (angle < 0) angle += 2 * CV_PI;  // shift to [0, 2*pi)
            int sector_index = static_cast<int>(angle / sector_width_rad);
            // Floating-point rounding can push an angle just below 2*pi onto
            // the upper boundary; keep the index inside the vector.
            if (sector_index >= sector) {
                sector_index = sector - 1;
            }
            black_pixel_counts[sector_index]++;
        }
    }

    return black_pixel_counts;
}

std::tuple<double, bool> compare_images(const std::string& image_path1, const std::string& image_path2, int pixel_threshold, int sector_threshold, int sector) {
    cv::Mat image1 = cv::imread(image_path1, cv::IMREAD_GRAYSCALE);
    cv::Mat image2 = cv::imread(image_path2, cv::IMREAD_GRAYSCALE);

    std::vector<int> black_pixels1 = count_black_pixels_in_sectors(image1, sector);
    std::vector<int> black_pixels2 = count_black_pixels_in_sectors(image2, sector);

    int num_similar_sectors = 0;
    int num_different_sectors = 0;

    for (size_t i = 0; i < black_pixels1.size(); ++i) {
        int diff = std::abs(black_pixels1[i] - black_pixels2[i]);

        if (diff <= pixel_threshold) {
            num_similar_sectors++;
        } else {
            num_different_sectors++;
        }
    }

    double similarity_percentage = (static_cast<double>(num_similar_sectors) / black_pixels1.size()) * 100;
    bool is_within_sector_threshold = num_different_sectors <= sector_threshold;

    return std::make_tuple(similarity_percentage, is_within_sector_threshold);
}

/**
 * @brief Runs compare_images and returns its result together with both paths.
 *
 * Used as the task body for std::async so that each finished comparison
 * carries the paths it belongs to.
 *
 * @param img_path_A       Path of the first image.
 * @param img_path_B       Path of the second image.
 * @param pixel_threshold  Forwarded to compare_images.
 * @param sector_threshold Forwarded to compare_images.
 * @param sector           Forwarded to compare_images.
 * @return (img_path_A, img_path_B, similarity percentage, within-sector-threshold flag).
 */
std::tuple<std::string, std::string, double, bool> compare_images_wrapper(const std::string& img_path_A, const std::string& img_path_B, int pixel_threshold, int sector_threshold, int sector) {
    const auto result = compare_images(img_path_A, img_path_B, pixel_threshold, sector_threshold, sector);
    return std::make_tuple(img_path_A, img_path_B, std::get<0>(result), std::get<1>(result));
}

void compare_subdirectories(const std::string& dirA, const std::string& dirB, const std::string& common_dir,
    int pixel_threshold, int sector_threshold, double similarity_threshold, double not_similarity_threshold, int sector, const std::string& db_dir) {
    std::string pathA = dirA + "/" + common_dir;
    std::string pathB = dirB + "/" + common_dir;

    std::set<std::string> filesA, filesB, common_files;
    for (const auto& entry : fs::directory_iterator(pathA)) {
        filesA.insert(entry.path().filename().string());
    }
    for (const auto& entry : fs::directory_iterator(pathB)) {
        filesB.insert(entry.path().filename().string());
    }
    std::set_intersection(filesA.begin(), filesA.end(), filesB.begin(), filesB.end(), std::inserter(common_files, common_files.begin()));

    std::vector<std::future<std::tuple<std::string, std::string, double, bool>>> futures;
    for (const auto& file_name : common_files) {
        std::string img_path_A = pathA + "/" + file_name;
        std::string img_path_B = pathB + "/" + file_name;
        futures.emplace_back(std::async(std::launch::async, compare_images_wrapper, img_path_A, img_path_B, pixel_threshold, sector_threshold, sector));
    }

    std::vector<std::tuple<std::string, std::string, double>> high_similarity_pairs;
    std::vector<std::tuple<std::string, std::string, double>> low_similarity_pairs;

    for (auto& future : futures) {
        auto [pathA, pathB, similarity, within_threshold] = future.get();
        if (similarity >= similarity_threshold) {
            high_similarity_pairs.emplace_back(pathA, pathB, similarity);
        }
        if (similarity <= not_similarity_threshold) {
            low_similarity_pairs.emplace_back(pathA, pathB, similarity);
        }
    }

    std::vector<std::string> unmatched_files_A, unmatched_files_B;
    std::set_difference(filesA.begin(), filesA.end(), common_files.begin(), common_files.end(), std::back_inserter(unmatched_files_A));
    std::set_difference(filesB.begin(), filesB.end(), common_files.begin(), common_files.end(), std::back_inserter(unmatched_files_B));

    // Open the database once per subdirectory
    std::string db_filename = db_dir + "/" + common_dir + ".db";
    sqlite3* db;
    sqlite3_open(db_filename.c_str(), &db);
    create_tables_and_insert_data(
        db,
        high_similarity_pairs,
        low_similarity_pairs,
        unmatched_files_A,
        unmatched_files_B
    );
    sqlite3_close(db);
}

/**
 * @brief Compares every subdirectory that exists under both `dirA` and `dirB`.
 *
 * One worker thread is started per common subdirectory, each running
 * compare_subdirectories; the call returns after all workers have finished.
 * A worker that fails reports the error on std::cerr and does not affect the
 * other workers.
 *
 * @param dirA                     Root of the first tree.
 * @param dirB                     Root of the second tree.
 * @param pixel_threshold          Forwarded to compare_subdirectories.
 * @param sector_threshold         Forwarded to compare_subdirectories.
 * @param similarity_threshold     Forwarded to compare_subdirectories.
 * @param not_similarity_threshold Forwarded to compare_subdirectories.
 * @param sector                   Forwarded to compare_subdirectories.
 * @param db_dir                   Directory receiving the per-subdirectory SQLite files.
 * @throws std::invalid_argument if either root does not exist.
 */
void compare_directories(const std::string& dirA, const std::string& dirB, int pixel_threshold, int sector_threshold,
    double similarity_threshold, double not_similarity_threshold, int sector, const std::string& db_dir) {
    if (!fs::exists(dirA) || !fs::exists(dirB)) {
        throw std::invalid_argument("One or both directories do not exist.");
    }

    // Collects the names of the directories directly inside `root`.
    const auto list_subdirectories = [](const std::string& root) {
        std::set<std::string> names;
        for (const auto& entry : fs::directory_iterator(root)) {
            if (fs::is_directory(entry)) {
                names.insert(entry.path().filename().string());
            }
        }
        return names;
    };

    const std::set<std::string> dirsA = list_subdirectories(dirA);
    const std::set<std::string> dirsB = list_subdirectories(dirB);
    std::set<std::string> common_dirs;
    std::set_intersection(dirsA.begin(), dirsA.end(), dirsB.begin(), dirsB.end(), std::inserter(common_dirs, common_dirs.begin()));

    std::vector<std::thread> threads;
    threads.reserve(common_dirs.size());

    const auto join_all = [&threads] {
        for (auto& thread : threads) {
            if (thread.joinable()) {
                thread.join();
            }
        }
    };

    try {
        for (const auto& common_dir : common_dirs) {
            // The references captured here stay valid because every worker is
            // joined before this function returns; the directory name is copied.
            threads.emplace_back([&, common_dir] {
                // An exception leaving a thread's entry function calls std::terminate.
                try {
                    compare_subdirectories(dirA, dirB, common_dir, pixel_threshold, sector_threshold,
                                           similarity_threshold, not_similarity_threshold, sector, db_dir);
                } catch (const std::exception& ex) {
                    std::cerr << "Failed to compare '" << common_dir << "': " << ex.what() << std::endl;
                } catch (...) {
                    std::cerr << "Failed to compare '" << common_dir << "': unknown error" << std::endl;
                }
            });
        }
    } catch (...) {
        // Destroying a joinable std::thread terminates the program, so wait
        // for the workers already running before propagating the failure.
        join_all();
        throw;
    }

    join_all();
}

/**
 * @brief Entry point: compares the two hard-coded image trees and prints the elapsed time.
 *
 * @return 0 on success, 1 if the comparison or the output-directory setup failed.
 */
int main() {
    cv::utils::logging::setLogLevel(cv::utils::logging::LOG_LEVEL_SILENT);
    const std::string dirA = R"(D:/OUTDIR/libharu_img)";
    const std::string dirB = R"(D:/OUTDIR/pdfcore_img)";
    const int pixel_threshold = 50;
    const int sector_threshold = 5;
    const double similarity_threshold = 70;
    const double not_similarity_threshold = 30;
    const int sector = 72;
    const std::string db_dir = "D:/OUTDIR/DB";

    try {
        // create_directories also creates missing parents and does nothing
        // when the directory already exists.
        fs::create_directories(db_dir);

        const auto start_time = std::chrono::high_resolution_clock::now();
        compare_directories(dirA, dirB, pixel_threshold, sector_threshold, similarity_threshold, not_similarity_threshold, sector, db_dir);
        const auto end_time = std::chrono::high_resolution_clock::now();

        const std::chrono::duration<double, std::milli> elapsed = end_time - start_time;
        std::cout << "Execution time: " << elapsed.count() << " ms" << std::endl;
    } catch (const std::exception& ex) {
        std::cerr << "Error: " << ex.what() << std::endl;
        return 1;
    }

    return 0;
}

```

Enhancements Made:

  1. Batch Insertions in Database: The create_tables_and_insert_data function runs all of its INSERT statements inside a single transaction, which avoids a separate commit for every row.
  2. Elapsed Time Measurement: Added code to measure and print the execution time of the directory comparison function.
  3. Optimized globbing and comparison: Ensured that files and common directories are only read once to minimize I/O operations.
  4. Reduced Redundant Database Connection: Open the SQLite database once per subdirectory processing instead of repeatedly opening and closing it for every insertion.

Please make sure to link the necessary libraries (OpenCV, SQLite) and to add proper error handling before using this in a production system. This implementation should give a substantial performance boost, especially for large datasets.

相關文章