6#include <rapidfuzz/details/CharSet.hpp>
14namespace rapidfuzz::fuzz {
20template <
typename InputIt1,
typename InputIt2>
21double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
23 return ratio(detail::Range(first1, last1), detail::Range(first2, last2), score_cutoff);
26template <
typename Sentence1,
typename Sentence2>
27double ratio(
const Sentence1& s1,
const Sentence2& s2,
const double score_cutoff)
29 return indel_normalized_similarity(s1, s2, score_cutoff / 100) * 100;
32template <
typename CharT1>
33template <
typename InputIt2>
34double CachedRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
35 double score_hint)
const
37 return similarity(detail::Range(first2, last2), score_cutoff, score_hint);
40template <
typename CharT1>
41template <
typename Sentence2>
42double CachedRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double score_hint)
const
44 return cached_indel.normalized_similarity(s2, score_cutoff / 100, score_hint / 100) * 100;
51namespace fuzz_detail {
53static constexpr double norm_distance(
size_t dist,
size_t lensum,
double score_cutoff = 0)
56 (lensum > 0) ? (100.0 - 100.0 *
static_cast<double>(dist) /
static_cast<double>(lensum)) : 100.0;
58 return (score >= score_cutoff) ? score : 0;
61static inline size_t score_cutoff_to_distance(
double score_cutoff,
size_t lensum)
63 return static_cast<size_t>(std::ceil(
static_cast<double>(lensum) * (1.0 - score_cutoff / 100)));
66template <
typename InputIt1,
typename InputIt2,
typename CachedCharT1>
68partial_ratio_impl(
const detail::Range<InputIt1>& s1,
const detail::Range<InputIt2>& s2,
69 const CachedRatio<CachedCharT1>& cached_ratio,
70 const detail::CharSet<iter_value_t<InputIt1>>& s1_char_set,
double score_cutoff)
72 ScoreAlignment<double> res;
73 size_t len1 = s1.size();
74 size_t len2 = s2.size();
81 size_t maximum = len1 * 2;
82 double norm_cutoff_sim = rapidfuzz::detail::NormSim_to_NormDist(score_cutoff / 100);
83 size_t cutoff_dist =
static_cast<size_t>(std::ceil(
static_cast<double>(maximum) * norm_cutoff_sim));
84 size_t best_dist = std::numeric_limits<size_t>::max();
85 std::vector<size_t> scores(len2 - len1, std::numeric_limits<size_t>::max());
86 std::vector<std::pair<size_t, size_t>> windows = {{0, len2 - len1 - 1}};
87 std::vector<std::pair<size_t, size_t>> new_windows;
89 while (!windows.empty()) {
90 for (
const auto& window : windows) {
91 auto subseq1_first = s2.begin() +
static_cast<ptrdiff_t
>(window.first);
92 auto subseq2_first = s2.begin() +
static_cast<ptrdiff_t
>(window.second);
93 detail::Range subseq1(subseq1_first, subseq1_first +
static_cast<ptrdiff_t
>(len1));
94 detail::Range subseq2(subseq2_first, subseq2_first +
static_cast<ptrdiff_t
>(len1));
96 if (scores[window.first] == std::numeric_limits<size_t>::max()) {
97 scores[window.first] = cached_ratio.cached_indel.distance(subseq1);
98 if (scores[window.first] < cutoff_dist) {
99 cutoff_dist = best_dist = scores[window.first];
100 res.dest_start = window.first;
101 res.dest_end = window.first + len1;
102 if (best_dist == 0) {
108 if (scores[window.second] == std::numeric_limits<size_t>::max()) {
109 scores[window.second] = cached_ratio.cached_indel.distance(subseq2);
110 if (scores[window.second] < cutoff_dist) {
111 cutoff_dist = best_dist = scores[window.second];
112 res.dest_start = window.second;
113 res.dest_end = window.second + len1;
114 if (best_dist == 0) {
121 size_t cell_diff = window.second - window.first;
122 if (cell_diff == 1)
continue;
125 size_t known_edits = detail::abs_diff(scores[window.first], scores[window.second]);
127 ptrdiff_t min_score =
128 static_cast<ptrdiff_t
>(std::min(scores[window.first], scores[window.second])) -
129 static_cast<ptrdiff_t
>(cell_diff + known_edits / 2);
130 if (min_score <
static_cast<ptrdiff_t
>(cutoff_dist)) {
131 size_t center = cell_diff / 2;
132 new_windows.emplace_back(window.first, window.first + center);
133 new_windows.emplace_back(window.first + center, window.second);
137 std::swap(windows, new_windows);
141 double score = 1.0 - (
static_cast<double>(best_dist) /
static_cast<double>(maximum));
143 if (score >= score_cutoff) score_cutoff = res.score = score;
146 for (
size_t i = 1; i < len1; ++i) {
147 rapidfuzz::detail::Range subseq(s2.begin(), s2.begin() +
static_cast<ptrdiff_t
>(i));
148 if (!s1_char_set.find(subseq.back()))
continue;
150 double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
151 if (ls_ratio > res.score) {
152 score_cutoff = res.score = ls_ratio;
155 if (res.score == 100.0)
return res;
159 for (
size_t i = len2 - len1; i < len2; ++i) {
160 rapidfuzz::detail::Range subseq(s2.begin() +
static_cast<ptrdiff_t
>(i), s2.end());
161 if (!s1_char_set.find(subseq.front()))
continue;
163 double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
164 if (ls_ratio > res.score) {
165 score_cutoff = res.score = ls_ratio;
168 if (res.score == 100.0)
return res;
175template <
typename InputIt1,
typename InputIt2,
typename CharT1 = iter_value_t<InputIt1>>
176ScoreAlignment<double> partial_ratio_impl(
const detail::Range<InputIt1>& s1,
177 const detail::Range<InputIt2>& s2,
double score_cutoff)
179 CachedRatio<CharT1> cached_ratio(s1);
181 detail::CharSet<CharT1> s1_char_set;
183 s1_char_set.insert(ch);
185 return partial_ratio_impl(s1, s2, cached_ratio, s1_char_set, score_cutoff);
190template <
typename InputIt1,
typename InputIt2>
191ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
192 InputIt2 last2,
double score_cutoff)
194 size_t len1 =
static_cast<size_t>(std::distance(first1, last1));
195 size_t len2 =
static_cast<size_t>(std::distance(first2, last2));
198 ScoreAlignment<double> result = partial_ratio_alignment(first2, last2, first1, last1, score_cutoff);
199 std::swap(result.src_start, result.dest_start);
200 std::swap(result.src_end, result.dest_end);
204 if (score_cutoff > 100)
return ScoreAlignment<double>(0, 0, len1, 0, len1);
207 return ScoreAlignment<double>(
static_cast<double>(len1 == len2) * 100.0, 0, len1, 0, len1);
209 auto s1 = detail::Range(first1, last1);
210 auto s2 = detail::Range(first2, last2);
212 auto alignment = fuzz_detail::partial_ratio_impl(s1, s2, score_cutoff);
213 if (alignment.score != 100 && s1.size() == s2.size()) {
214 score_cutoff = std::max(score_cutoff, alignment.score);
215 auto alignment2 = fuzz_detail::partial_ratio_impl(s2, s1, score_cutoff);
216 if (alignment2.score > alignment.score) {
217 std::swap(alignment2.src_start, alignment2.dest_start);
218 std::swap(alignment2.src_end, alignment2.dest_end);
226template <
typename Sentence1,
typename Sentence2>
227ScoreAlignment<double> partial_ratio_alignment(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
229 return partial_ratio_alignment(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
230 detail::to_end(s2), score_cutoff);
233template <
typename InputIt1,
typename InputIt2>
234double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
236 return partial_ratio_alignment(first1, last1, first2, last2, score_cutoff).score;
239template <
typename Sentence1,
typename Sentence2>
240double partial_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
242 return partial_ratio_alignment(s1, s2, score_cutoff).score;
245template <
typename CharT1>
246template <
typename InputIt1>
247CachedPartialRatio<CharT1>::CachedPartialRatio(InputIt1 first1, InputIt1 last1)
248 : s1(first1, last1), cached_ratio(first1, last1)
250 for (
const auto& ch : s1)
251 s1_char_set.insert(ch);
254template <
typename CharT1>
255template <
typename InputIt2>
256double CachedPartialRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
257 [[maybe_unused]]
double score_hint)
const
259 size_t len1 = s1.size();
260 size_t len2 =
static_cast<size_t>(std::distance(first2, last2));
263 return partial_ratio(detail::to_begin(s1), detail::to_end(s1), first2, last2, score_cutoff);
265 if (score_cutoff > 100)
return 0;
267 if (!len1 || !len2)
return static_cast<double>(len1 == len2) * 100.0;
269 auto s1_ = detail::Range(s1);
270 auto s2 = detail::Range(first2, last2);
272 double score = fuzz_detail::partial_ratio_impl(s1_, s2, cached_ratio, s1_char_set, score_cutoff).score;
273 if (score != 100 && s1_.size() == s2.size()) {
274 score_cutoff = std::max(score_cutoff, score);
275 double score2 = fuzz_detail::partial_ratio_impl(s2, s1_, score_cutoff).score;
276 if (score2 > score)
return score2;
282template <
typename CharT1>
283template <
typename Sentence2>
284double CachedPartialRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
285 [[maybe_unused]]
double score_hint)
const
287 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
293template <
typename InputIt1,
typename InputIt2>
294double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
296 if (score_cutoff > 100)
return 0;
298 return ratio(detail::sorted_split(first1, last1).join(), detail::sorted_split(first2, last2).join(),
302template <
typename Sentence1,
typename Sentence2>
305 return token_sort_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
306 detail::to_end(s2), score_cutoff);
309template <
typename CharT1>
310template <
typename InputIt2>
311double CachedTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
312 [[maybe_unused]]
double score_hint)
const
314 if (score_cutoff > 100)
return 0;
316 return cached_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
319template <
typename CharT1>
320template <
typename Sentence2>
321double CachedTokenSortRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
322 [[maybe_unused]]
double score_hint)
const
324 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
331template <
typename InputIt1,
typename InputIt2>
335 if (score_cutoff > 100)
return 0;
337 return partial_ratio(detail::sorted_split(first1, last1).join(),
338 detail::sorted_split(first2, last2).join(), score_cutoff);
341template <
typename Sentence1,
typename Sentence2>
345 detail::to_end(s2), score_cutoff);
348template <
typename CharT1>
349template <
typename InputIt2>
350double CachedPartialTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
351 [[maybe_unused]]
double score_hint)
const
353 if (score_cutoff > 100)
return 0;
355 return cached_partial_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
358template <
typename CharT1>
359template <
typename Sentence2>
360double CachedPartialTokenSortRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
361 [[maybe_unused]]
double score_hint)
const
363 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
370namespace fuzz_detail {
371template <
typename InputIt1,
typename InputIt2>
372double token_set_ratio(
const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_a,
373 const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
374 const double score_cutoff)
378 if (tokens_a.empty() || tokens_b.empty())
return 0;
380 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
381 auto intersect = decomposition.intersection;
382 auto diff_ab = decomposition.difference_ab;
383 auto diff_ba = decomposition.difference_ba;
386 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty()))
return 100;
388 auto diff_ab_joined = diff_ab.join();
389 auto diff_ba_joined = diff_ba.join();
391 size_t ab_len = diff_ab_joined.size();
392 size_t ba_len = diff_ba_joined.size();
393 size_t sect_len = intersect.length();
396 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
397 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
400 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
401 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
403 if (dist <= cutoff_distance) result = norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff);
406 if (!sect_len)
return result;
411 size_t sect_ab_dist = bool(sect_len) + ab_len;
412 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
414 size_t sect_ba_dist = bool(sect_len) + ba_len;
415 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
417 return std::max({result, sect_ab_ratio, sect_ba_ratio});
421template <
typename InputIt1,
typename InputIt2>
422double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
424 if (score_cutoff > 100)
return 0;
427 detail::sorted_split(first2, last2), score_cutoff);
430template <
typename Sentence1,
typename Sentence2>
433 return token_set_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
437template <
typename CharT1>
438template <
typename InputIt2>
439double CachedTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
440 [[maybe_unused]]
double score_hint)
const
442 if (score_cutoff > 100)
return 0;
447template <
typename CharT1>
448template <
typename Sentence2>
449double CachedTokenSetRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
450 [[maybe_unused]]
double score_hint)
const
452 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
459namespace fuzz_detail {
460template <
typename InputIt1,
typename InputIt2>
462 const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
463 const double score_cutoff)
467 if (tokens_a.empty() || tokens_b.empty())
return 0;
469 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
472 if (!decomposition.intersection.empty())
return 100;
474 return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(),
479template <
typename InputIt1,
typename InputIt2>
483 if (score_cutoff > 100)
return 0;
486 detail::sorted_split(first2, last2), score_cutoff);
489template <
typename Sentence1,
typename Sentence2>
493 detail::to_end(s2), score_cutoff);
496template <
typename CharT1>
497template <
typename InputIt2>
498double CachedPartialTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
499 [[maybe_unused]]
double score_hint)
const
501 if (score_cutoff > 100)
return 0;
506template <
typename CharT1>
507template <
typename Sentence2>
508double CachedPartialTokenSetRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
509 [[maybe_unused]]
double score_hint)
const
511 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
518template <
typename InputIt1,
typename InputIt2>
519double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
521 if (score_cutoff > 100)
return 0;
523 auto tokens_a = detail::sorted_split(first1, last1);
524 auto tokens_b = detail::sorted_split(first2, last2);
526 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
527 auto intersect = decomposition.intersection;
528 auto diff_ab = decomposition.difference_ab;
529 auto diff_ba = decomposition.difference_ba;
531 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty()))
return 100;
533 auto diff_ab_joined = diff_ab.join();
534 auto diff_ba_joined = diff_ba.join();
536 size_t ab_len = diff_ab_joined.size();
537 size_t ba_len = diff_ba_joined.size();
538 size_t sect_len = intersect.length();
540 double result =
ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
543 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
544 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
546 size_t cutoff_distance = fuzz_detail::score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
547 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
548 if (dist <= cutoff_distance)
549 result = std::max(result, fuzz_detail::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
552 if (!sect_len)
return result;
557 size_t sect_ab_dist = bool(sect_len) + ab_len;
558 double sect_ab_ratio = fuzz_detail::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
560 size_t sect_ba_dist = bool(sect_len) + ba_len;
561 double sect_ba_ratio = fuzz_detail::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
563 return std::max({result, sect_ab_ratio, sect_ba_ratio});
566template <
typename Sentence1,
typename Sentence2>
567double token_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
569 return token_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
573namespace fuzz_detail {
574template <
typename CharT1,
typename CachedCharT1,
typename InputIt2>
575double token_ratio(
const rapidfuzz::detail::SplittedSentenceView<CharT1>& s1_tokens,
576 const CachedRatio<CachedCharT1>& cached_ratio_s1_sorted, InputIt2 first2, InputIt2 last2,
579 if (score_cutoff > 100)
return 0;
581 auto s2_tokens = detail::sorted_split(first2, last2);
583 auto decomposition = detail::set_decomposition(s1_tokens, s2_tokens);
584 auto intersect = decomposition.intersection;
585 auto diff_ab = decomposition.difference_ab;
586 auto diff_ba = decomposition.difference_ba;
588 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty()))
return 100;
590 auto diff_ab_joined = diff_ab.join();
591 auto diff_ba_joined = diff_ba.join();
593 size_t ab_len = diff_ab_joined.size();
594 size_t ba_len = diff_ba_joined.size();
595 size_t sect_len = intersect.length();
597 double result = cached_ratio_s1_sorted.similarity(s2_tokens.join(), score_cutoff);
600 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
601 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
603 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
604 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
605 if (dist <= cutoff_distance)
606 result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
609 if (!sect_len)
return result;
614 size_t sect_ab_dist = bool(sect_len) + ab_len;
615 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
617 size_t sect_ba_dist = bool(sect_len) + ba_len;
618 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
620 return std::max({result, sect_ab_ratio, sect_ba_ratio});
624template <
typename CharT1,
typename InputIt1,
typename InputIt2>
625double token_ratio(
const std::vector<CharT1>& s1_sorted,
626 const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
627 const detail::BlockPatternMatchVector& blockmap_s1_sorted, InputIt2 first2, InputIt2 last2,
630 if (score_cutoff > 100)
return 0;
632 auto tokens_b = detail::sorted_split(first2, last2);
634 auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
635 auto intersect = decomposition.intersection;
636 auto diff_ab = decomposition.difference_ab;
637 auto diff_ba = decomposition.difference_ba;
639 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty()))
return 100;
641 auto diff_ab_joined = diff_ab.join();
642 auto diff_ba_joined = diff_ba.join();
644 size_t ab_len = diff_ab_joined.size();
645 size_t ba_len = diff_ba_joined.size();
646 size_t sect_len = intersect.length();
649 auto s2_sorted = tokens_b.join();
650 if (s1_sorted.size() < 65) {
651 double norm_sim = detail::indel_normalized_similarity(blockmap_s1_sorted, detail::Range(s1_sorted),
652 detail::Range(s2_sorted), score_cutoff / 100);
653 result = norm_sim * 100;
656 result =
fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
660 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
661 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
663 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
664 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
665 if (dist <= cutoff_distance)
666 result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
669 if (!sect_len)
return result;
674 size_t sect_ab_dist = bool(sect_len) + ab_len;
675 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
677 size_t sect_ba_dist = bool(sect_len) + ba_len;
678 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
680 return std::max({result, sect_ab_ratio, sect_ba_ratio});
684template <
typename CharT1>
685template <
typename InputIt2>
686double CachedTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
687 [[maybe_unused]]
double score_hint)
const
692template <
typename CharT1>
693template <
typename Sentence2>
694double CachedTokenRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
695 [[maybe_unused]]
double score_hint)
const
697 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
704template <
typename InputIt1,
typename InputIt2>
705double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
708 if (score_cutoff > 100)
return 0;
710 auto tokens_a = detail::sorted_split(first1, last1);
711 auto tokens_b = detail::sorted_split(first2, last2);
713 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
716 if (!decomposition.intersection.empty())
return 100;
718 auto diff_ab = decomposition.difference_ab;
719 auto diff_ba = decomposition.difference_ba;
721 double result =
partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
724 if (tokens_a.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
728 score_cutoff = std::max(score_cutoff, result);
729 return std::max(result,
partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
732template <
typename Sentence1,
typename Sentence2>
736 detail::to_end(s2), score_cutoff);
739namespace fuzz_detail {
740template <
typename CharT1,
typename InputIt1,
typename InputIt2>
742 const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
743 InputIt2 first2, InputIt2 last2,
double score_cutoff)
745 if (score_cutoff > 100)
return 0;
747 auto tokens_b = detail::sorted_split(first2, last2);
749 auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
752 if (!decomposition.intersection.empty())
return 100;
754 auto diff_ab = decomposition.difference_ab;
755 auto diff_ba = decomposition.difference_ba;
757 double result =
partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
760 if (tokens_s1.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
764 score_cutoff = std::max(score_cutoff, result);
765 return std::max(result,
partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
770template <
typename CharT1>
771template <
typename InputIt2>
772double CachedPartialTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
773 [[maybe_unused]]
double score_hint)
const
778template <
typename CharT1>
779template <
typename Sentence2>
780double CachedPartialTokenRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
781 [[maybe_unused]]
double score_hint)
const
783 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
790template <
typename InputIt1,
typename InputIt2>
791double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
793 if (score_cutoff > 100)
return 0;
795 constexpr double UNBASE_SCALE = 0.95;
797 auto len1 = std::distance(first1, last1);
798 auto len2 = std::distance(first2, last2);
802 if (!len1 || !len2)
return 0;
804 double len_ratio = (len1 > len2) ?
static_cast<double>(len1) /
static_cast<double>(len2)
805 :
static_cast<double>(len2) /
static_cast<double>(len1);
807 double end_ratio =
ratio(first1, last1, first2, last2, score_cutoff);
809 if (len_ratio < 1.5) {
810 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
811 return std::max(end_ratio,
token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE);
814 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
816 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
818 std::max(end_ratio,
partial_ratio(first1, last1, first2, last2, score_cutoff) * PARTIAL_SCALE);
820 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
821 return std::max(end_ratio,
partial_token_ratio(first1, last1, first2, last2, score_cutoff) *
822 UNBASE_SCALE * PARTIAL_SCALE);
825template <
typename Sentence1,
typename Sentence2>
826double WRatio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
828 return WRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
832template <
typename Sentence1>
833template <
typename InputIt1>
834CachedWRatio<Sentence1>::CachedWRatio(InputIt1 first1, InputIt1 last1)
836 cached_partial_ratio(first1, last1),
837 tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
838 s1_sorted(tokens_s1.join()),
839 blockmap_s1_sorted(detail::Range(s1_sorted))
842template <
typename CharT1>
843template <
typename InputIt2>
844double CachedWRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
845 [[maybe_unused]]
double score_hint)
const
847 if (score_cutoff > 100)
return 0;
849 constexpr double UNBASE_SCALE = 0.95;
851 size_t len1 = s1.size();
852 size_t len2 =
static_cast<size_t>(std::distance(first2, last2));
856 if (!len1 || !len2)
return 0;
858 double len_ratio = (len1 > len2) ?
static_cast<double>(len1) /
static_cast<double>(len2)
859 :
static_cast<double>(len2) /
static_cast<double>(len1);
861 double end_ratio = cached_partial_ratio.cached_ratio.similarity(first2, last2, score_cutoff);
863 if (len_ratio < 1.5) {
864 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
868 return std::max(end_ratio, r * UNBASE_SCALE);
871 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
873 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
875 std::max(end_ratio, cached_partial_ratio.similarity(first2, last2, score_cutoff) * PARTIAL_SCALE);
877 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
879 return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
882template <
typename CharT1>
883template <
typename Sentence2>
884double CachedWRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
885 [[maybe_unused]]
double score_hint)
const
887 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
894template <
typename InputIt1,
typename InputIt2>
895double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
897 ptrdiff_t len1 = std::distance(first1, last1);
898 ptrdiff_t len2 = std::distance(first2, last2);
902 if (!len1 || !len2)
return 0;
904 return ratio(first1, last1, first2, last2, score_cutoff);
907template <
typename Sentence1,
typename Sentence2>
908double QRatio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
910 return QRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
914template <
typename CharT1>
915template <
typename InputIt2>
916double CachedQRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
917 [[maybe_unused]]
double score_hint)
const
919 auto len2 = std::distance(first2, last2);
923 if (s1.empty() || !len2)
return 0;
925 return cached_ratio.similarity(first2, last2, score_cutoff);
928template <
typename CharT1>
929template <
typename Sentence2>
930double CachedQRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
931 [[maybe_unused]]
double score_hint)
const
933 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
double WRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a weighted ratio based on the other ratio algorithms.
Definition: fuzz_impl.hpp:826
double ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates a simple ratio between two strings
Definition: fuzz_impl.hpp:27
double QRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a quick ratio between two strings using fuzz.ratio.
Definition: fuzz_impl.hpp:908
double partial_token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::partial_r...
Definition: fuzz_impl.hpp:490
double token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::token_set_ratio and fuzz::token_sort_ratio (faster th...
Definition: fuzz_impl.hpp:567
double partial_token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::partial_token_set_ratio and fuzz::partial_token_sort_...
Definition: fuzz_impl.hpp:733
double token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::ratio.
Definition: fuzz_impl.hpp:431
double partial_token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::partial_ratio between them.
Definition: fuzz_impl.hpp:342
double token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::ratio between them.
Definition: fuzz_impl.hpp:303
double partial_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates the fuzz::ratio of the optimal string alignment
Definition: fuzz_impl.hpp:240