#include #include #include #include #include #include #include #include #include #include #include "lib/Epub/Epub/hyphenation/HyphenationCommon.h" #include "lib/Epub/Epub/hyphenation/LanguageHyphenator.h" #include "lib/Epub/Epub/hyphenation/LanguageRegistry.h" struct TestCase { std::string word; std::string hyphenated; std::vector expectedPositions; int frequency; }; struct EvaluationResult { int truePositives = 0; int falsePositives = 0; int falseNegatives = 0; double precision = 0.0; double recall = 0.0; double f1Score = 0.0; double weightedScore = 0.0; }; struct LanguageConfig { std::string cliName; std::string testDataFile; const char* primaryTag; }; const std::vector kSupportedLanguages = { {"english", "test/hyphenation_eval/resources/english_hyphenation_tests.txt", "en"}, {"french", "test/hyphenation_eval/resources/french_hyphenation_tests.txt", "fr"}, {"german", "test/hyphenation_eval/resources/german_hyphenation_tests.txt", "de"}, {"russian", "test/hyphenation_eval/resources/russian_hyphenation_tests.txt", "ru"}, }; std::vector expectedPositionsFromAnnotatedWord(const std::string& annotated) { std::vector positions; const unsigned char* ptr = reinterpret_cast(annotated.c_str()); size_t codepointIndex = 0; while (*ptr != 0) { if (*ptr == '=') { positions.push_back(codepointIndex); ++ptr; continue; } utf8NextCodepoint(&ptr); ++codepointIndex; } return positions; } std::vector loadTestData(const std::string& filename) { std::vector testCases; std::ifstream file(filename); if (!file.is_open()) { std::cerr << "Error: Could not open file " << filename << std::endl; return testCases; } std::string line; while (std::getline(file, line)) { if (line.empty() || line[0] == '#') { continue; } std::istringstream iss(line); std::string word, hyphenated, freqStr; if (std::getline(iss, word, '|') && std::getline(iss, hyphenated, '|') && std::getline(iss, freqStr, '|')) { TestCase testCase; testCase.word = word; testCase.hyphenated = hyphenated; testCase.frequency = std::stoi(freqStr); testCase.expectedPositions = expectedPositionsFromAnnotatedWord(hyphenated); testCases.push_back(testCase); } } file.close(); return testCases; } std::string positionsToHyphenated(const std::string& word, const std::vector& positions) { std::string result; std::vector sortedPositions = positions; std::sort(sortedPositions.begin(), sortedPositions.end()); const unsigned char* ptr = reinterpret_cast(word.c_str()); size_t codepointIndex = 0; size_t posIdx = 0; while (*ptr != 0) { while (posIdx < sortedPositions.size() && sortedPositions[posIdx] == codepointIndex) { result.push_back('='); ++posIdx; } const unsigned char* current = ptr; utf8NextCodepoint(&ptr); result.append(reinterpret_cast(current), reinterpret_cast(ptr)); ++codepointIndex; } while (posIdx < sortedPositions.size() && sortedPositions[posIdx] == codepointIndex) { result.push_back('='); ++posIdx; } return result; } std::vector hyphenateWordWithHyphenator(const std::string& word, const LanguageHyphenator& hyphenator) { auto cps = collectCodepoints(word); trimSurroundingPunctuationAndFootnote(cps); return hyphenator.breakIndexes(cps); } std::vector resolveLanguages(const std::string& selection) { if (selection == "all") { return kSupportedLanguages; } for (const auto& config : kSupportedLanguages) { if (config.cliName == selection) { return {config}; } } return {}; } EvaluationResult evaluateWord(const TestCase& testCase, std::function(const std::string&)> hyphenateFunc) { EvaluationResult result; std::vector actualPositions = hyphenateFunc(testCase.word); std::vector expected = testCase.expectedPositions; std::vector actual = actualPositions; std::sort(expected.begin(), expected.end()); std::sort(actual.begin(), actual.end()); for (size_t pos : actual) { if (std::find(expected.begin(), expected.end(), pos) != expected.end()) { result.truePositives++; } else { result.falsePositives++; } } for (size_t pos : expected) { if (std::find(actual.begin(), actual.end(), pos) == actual.end()) { result.falseNegatives++; } } if (result.truePositives + result.falsePositives > 0) { result.precision = static_cast(result.truePositives) / (result.truePositives + result.falsePositives); } if (result.truePositives + result.falseNegatives > 0) { result.recall = static_cast(result.truePositives) / (result.truePositives + result.falseNegatives); } if (result.precision + result.recall > 0) { result.f1Score = 2 * result.precision * result.recall / (result.precision + result.recall); } // Treat words that contain no hyphenation marks in both the expected data and the // algorithmic output as perfect matches so they don't drag down the per-word averages. if (expected.empty() && actual.empty()) { result.precision = 1.0; result.recall = 1.0; result.f1Score = 1.0; } double fpPenalty = 2.0; double fnPenalty = 1.0; int totalErrors = result.falsePositives * fpPenalty + result.falseNegatives * fnPenalty; int totalPossible = expected.size() * fpPenalty; if (totalPossible > 0) { result.weightedScore = 1.0 - (static_cast(totalErrors) / totalPossible); result.weightedScore = std::max(0.0, result.weightedScore); } else if (result.falsePositives == 0) { result.weightedScore = 1.0; } return result; } void printResults(const std::string& language, const std::vector& testCases, const std::vector>& worstCases, int perfectMatches, int partialMatches, int completeMisses, double totalPrecision, double totalRecall, double totalF1, double totalWeighted, int totalTP, int totalFP, int totalFN, std::function(const std::string&)> hyphenateFunc) { std::string lang_upper = language; if (!lang_upper.empty()) { lang_upper[0] = std::toupper(lang_upper[0]); } std::cout << "================================================================================" << std::endl; std::cout << lang_upper << " HYPHENATION EVALUATION RESULTS" << std::endl; std::cout << "================================================================================" << std::endl; std::cout << std::endl; std::cout << "Total test cases: " << testCases.size() << std::endl; std::cout << "Perfect matches: " << perfectMatches << " (" << (perfectMatches * 100.0 / testCases.size()) << "%)" << std::endl; std::cout << "Partial matches: " << partialMatches << std::endl; std::cout << "Complete misses: " << completeMisses << std::endl; std::cout << std::endl; std::cout << "--- Overall Metrics (averaged per word) ---" << std::endl; std::cout << "Average Precision: " << (totalPrecision / testCases.size() * 100.0) << "%" << std::endl; std::cout << "Average Recall: " << (totalRecall / testCases.size() * 100.0) << "%" << std::endl; std::cout << "Average F1 Score: " << (totalF1 / testCases.size() * 100.0) << "%" << std::endl; std::cout << "Average Weighted Score: " << (totalWeighted / testCases.size() * 100.0) << "% (FP penalty: 2x)" << std::endl; std::cout << std::endl; std::cout << "--- Overall Metrics (total counts) ---" << std::endl; std::cout << "True Positives: " << totalTP << std::endl; std::cout << "False Positives: " << totalFP << " (incorrect hyphenation points)" << std::endl; std::cout << "False Negatives: " << totalFN << " (missed hyphenation points)" << std::endl; double overallPrecision = totalTP + totalFP > 0 ? static_cast(totalTP) / (totalTP + totalFP) : 0.0; double overallRecall = totalTP + totalFN > 0 ? static_cast(totalTP) / (totalTP + totalFN) : 0.0; double overallF1 = overallPrecision + overallRecall > 0 ? 2 * overallPrecision * overallRecall / (overallPrecision + overallRecall) : 0.0; std::cout << "Overall Precision: " << (overallPrecision * 100.0) << "%" << std::endl; std::cout << "Overall Recall: " << (overallRecall * 100.0) << "%" << std::endl; std::cout << "Overall F1 Score: " << (overallF1 * 100.0) << "%" << std::endl; std::cout << std::endl; // Filter out perfect matches from the “worst cases” section so that only actionable failures appear. auto hasImperfection = [](const EvaluationResult& r) { return r.weightedScore < 0.999999; }; std::vector> imperfectCases; imperfectCases.reserve(worstCases.size()); for (const auto& entry : worstCases) { if (hasImperfection(entry.second)) { imperfectCases.push_back(entry); } } std::cout << "--- Worst Cases (lowest weighted scores) ---" << std::endl; int showCount = std::min(10, static_cast(imperfectCases.size())); for (int i = 0; i < showCount; i++) { const auto& testCase = imperfectCases[i].first; const auto& result = imperfectCases[i].second; std::vector actualPositions = hyphenateFunc(testCase.word); std::string actualHyphenated = positionsToHyphenated(testCase.word, actualPositions); std::cout << "Word: " << testCase.word << " (freq: " << testCase.frequency << ")" << std::endl; std::cout << " Expected: " << testCase.hyphenated << std::endl; std::cout << " Got: " << actualHyphenated << std::endl; std::cout << " Precision: " << (result.precision * 100.0) << "%" << " Recall: " << (result.recall * 100.0) << "%" << " F1: " << (result.f1Score * 100.0) << "%" << " Weighted: " << (result.weightedScore * 100.0) << "%" << std::endl; std::cout << " TP: " << result.truePositives << " FP: " << result.falsePositives << " FN: " << result.falseNegatives << std::endl; std::cout << std::endl; } // Additional compact list of the worst ~100 words to aid iteration int compactCount = std::min(100, static_cast(imperfectCases.size())); if (compactCount > 0) { std::cout << "--- Compact Worst Cases (" << compactCount << ") ---" << std::endl; for (int i = 0; i < compactCount; i++) { const auto& testCase = imperfectCases[i].first; std::vector actualPositions = hyphenateFunc(testCase.word); std::string actualHyphenated = positionsToHyphenated(testCase.word, actualPositions); std::cout << testCase.word << " | exp:" << testCase.hyphenated << " | got:" << actualHyphenated << std::endl; } std::cout << std::endl; } } int main(int argc, char* argv[]) { const bool summaryMode = argc <= 1; const std::string languageSelection = summaryMode ? "all" : argv[1]; std::vector languages = resolveLanguages(languageSelection); if (languages.empty()) { std::cerr << "Unknown language: " << languageSelection << std::endl; return 1; } for (const auto& lang : languages) { const auto* hyphenator = getLanguageHyphenatorForPrimaryTag(lang.primaryTag); if (!hyphenator) { std::cerr << "No hyphenator registered for tag: " << lang.primaryTag << std::endl; continue; } const auto hyphenateFunc = [hyphenator](const std::string& word) { return hyphenateWordWithHyphenator(word, *hyphenator); }; if (!summaryMode) { std::cout << "Loading test data from: " << lang.testDataFile << std::endl; } std::vector testCases = loadTestData(lang.testDataFile); if (testCases.empty()) { std::cerr << "No test cases loaded for " << lang.cliName << ". Skipping." << std::endl; continue; } if (!summaryMode) { std::cout << "Loaded " << testCases.size() << " test cases for " << lang.cliName << std::endl; std::cout << std::endl; } int perfectMatches = 0; int partialMatches = 0; int completeMisses = 0; double totalPrecision = 0.0; double totalRecall = 0.0; double totalF1 = 0.0; double totalWeighted = 0.0; int totalTP = 0, totalFP = 0, totalFN = 0; std::vector> worstCases; for (const auto& testCase : testCases) { EvaluationResult result = evaluateWord(testCase, hyphenateFunc); totalTP += result.truePositives; totalFP += result.falsePositives; totalFN += result.falseNegatives; totalPrecision += result.precision; totalRecall += result.recall; totalF1 += result.f1Score; totalWeighted += result.weightedScore; if (result.f1Score == 1.0) { perfectMatches++; } else if (result.f1Score > 0.0) { partialMatches++; } else { completeMisses++; } worstCases.push_back({testCase, result}); } if (summaryMode) { const double averageF1Percent = testCases.empty() ? 0.0 : (totalF1 / testCases.size() * 100.0); std::cout << lang.cliName << ": " << averageF1Percent << "%" << std::endl; continue; } std::sort(worstCases.begin(), worstCases.end(), [](const auto& a, const auto& b) { return a.second.weightedScore < b.second.weightedScore; }); printResults(lang.cliName, testCases, worstCases, perfectMatches, partialMatches, completeMisses, totalPrecision, totalRecall, totalF1, totalWeighted, totalTP, totalFP, totalFN, hyphenateFunc); } return 0; }