RapidFuzz
fuzz_impl.hpp
1/* SPDX-License-Identifier: MIT */
2/* Copyright © 2021-present Max Bachmann */
3/* Copyright © 2011 Adam Cohen */
4
5#include <limits>
6#include <rapidfuzz/details/CharSet.hpp>
7
8#include <algorithm>
9#include <cmath>
10#include <iterator>
11#include <sys/types.h>
12#include <vector>
13
14namespace rapidfuzz::fuzz {
15
16/**********************************************
17 * ratio
18 *********************************************/
19
20template <typename InputIt1, typename InputIt2>
21double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
22{
23 return ratio(detail::Range(first1, last1), detail::Range(first2, last2), score_cutoff);
24}
25
26template <typename Sentence1, typename Sentence2>
27double ratio(const Sentence1& s1, const Sentence2& s2, const double score_cutoff)
28{
29 return indel_normalized_similarity(s1, s2, score_cutoff / 100) * 100;
30}
31
32template <typename CharT1>
33template <typename InputIt2>
34double CachedRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
35 double score_hint) const
36{
37 return similarity(detail::Range(first2, last2), score_cutoff, score_hint);
38}
39
40template <typename CharT1>
41template <typename Sentence2>
42double CachedRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff, double score_hint) const
43{
44 return cached_indel.normalized_similarity(s2, score_cutoff / 100, score_hint / 100) * 100;
45}
46
47/**********************************************
48 * partial_ratio
49 *********************************************/
50
51namespace fuzz_detail {
52
53static constexpr double norm_distance(size_t dist, size_t lensum, double score_cutoff = 0)
54{
55 double score =
56 (lensum > 0) ? (100.0 - 100.0 * static_cast<double>(dist) / static_cast<double>(lensum)) : 100.0;
57
58 return (score >= score_cutoff) ? score : 0;
59}
60
61static inline size_t score_cutoff_to_distance(double score_cutoff, size_t lensum)
62{
63 return static_cast<size_t>(std::ceil(static_cast<double>(lensum) * (1.0 - score_cutoff / 100)));
64}
65
66template <typename InputIt1, typename InputIt2, typename CachedCharT1>
67ScoreAlignment<double>
68partial_ratio_impl(const detail::Range<InputIt1>& s1, const detail::Range<InputIt2>& s2,
69 const CachedRatio<CachedCharT1>& cached_ratio,
70 const detail::CharSet<iter_value_t<InputIt1>>& s1_char_set, double score_cutoff)
71{
72 ScoreAlignment<double> res;
73 size_t len1 = s1.size();
74 size_t len2 = s2.size();
75 res.src_start = 0;
76 res.src_end = len1;
77 res.dest_start = 0;
78 res.dest_end = len1;
79
80 if (len2 > len1) {
81 size_t maximum = len1 * 2;
82 double norm_cutoff_sim = rapidfuzz::detail::NormSim_to_NormDist(score_cutoff / 100);
83 size_t cutoff_dist = static_cast<size_t>(std::ceil(static_cast<double>(maximum) * norm_cutoff_sim));
84 size_t best_dist = std::numeric_limits<size_t>::max();
85 std::vector<size_t> scores(len2 - len1, std::numeric_limits<size_t>::max());
86 std::vector<std::pair<size_t, size_t>> windows = {{0, len2 - len1 - 1}};
87 std::vector<std::pair<size_t, size_t>> new_windows;
88
89 while (!windows.empty()) {
90 for (const auto& window : windows) {
91 auto subseq1_first = s2.begin() + static_cast<ptrdiff_t>(window.first);
92 auto subseq2_first = s2.begin() + static_cast<ptrdiff_t>(window.second);
93 detail::Range subseq1(subseq1_first, subseq1_first + static_cast<ptrdiff_t>(len1));
94 detail::Range subseq2(subseq2_first, subseq2_first + static_cast<ptrdiff_t>(len1));
95
96 if (scores[window.first] == std::numeric_limits<size_t>::max()) {
97 scores[window.first] = cached_ratio.cached_indel.distance(subseq1);
98 if (scores[window.first] < cutoff_dist) {
99 cutoff_dist = best_dist = scores[window.first];
100 res.dest_start = window.first;
101 res.dest_end = window.first + len1;
102 if (best_dist == 0) {
103 res.score = 100;
104 return res;
105 }
106 }
107 }
108 if (scores[window.second] == std::numeric_limits<size_t>::max()) {
109 scores[window.second] = cached_ratio.cached_indel.distance(subseq2);
110 if (scores[window.second] < cutoff_dist) {
111 cutoff_dist = best_dist = scores[window.second];
112 res.dest_start = window.second;
113 res.dest_end = window.second + len1;
114 if (best_dist == 0) {
115 res.score = 100;
116 return res;
117 }
118 }
119 }
120
121 size_t cell_diff = window.second - window.first;
122 if (cell_diff == 1) continue;
123
124 /* find the minimum score possible in the range first <-> last */
125 size_t known_edits = detail::abs_diff(scores[window.first], scores[window.second]);
126 /* half of the cells that are not needed for known_edits can lead to a better score */
127 ptrdiff_t min_score =
128 static_cast<ptrdiff_t>(std::min(scores[window.first], scores[window.second])) -
129 static_cast<ptrdiff_t>(cell_diff + known_edits / 2);
130 if (min_score < static_cast<ptrdiff_t>(cutoff_dist)) {
131 size_t center = cell_diff / 2;
132 new_windows.emplace_back(window.first, window.first + center);
133 new_windows.emplace_back(window.first + center, window.second);
134 }
135 }
136
137 std::swap(windows, new_windows);
138 new_windows.clear();
139 }
140
141 double score = 1.0 - (static_cast<double>(best_dist) / static_cast<double>(maximum));
142 score *= 100;
143 if (score >= score_cutoff) score_cutoff = res.score = score;
144 }
145
146 for (size_t i = 1; i < len1; ++i) {
147 rapidfuzz::detail::Range subseq(s2.begin(), s2.begin() + static_cast<ptrdiff_t>(i));
148 if (!s1_char_set.find(subseq.back())) continue;
149
150 double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
151 if (ls_ratio > res.score) {
152 score_cutoff = res.score = ls_ratio;
153 res.dest_start = 0;
154 res.dest_end = i;
155 if (res.score == 100.0) return res;
156 }
157 }
158
159 for (size_t i = len2 - len1; i < len2; ++i) {
160 rapidfuzz::detail::Range subseq(s2.begin() + static_cast<ptrdiff_t>(i), s2.end());
161 if (!s1_char_set.find(subseq.front())) continue;
162
163 double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
164 if (ls_ratio > res.score) {
165 score_cutoff = res.score = ls_ratio;
166 res.dest_start = i;
167 res.dest_end = len2;
168 if (res.score == 100.0) return res;
169 }
170 }
171
172 return res;
173}
174
175template <typename InputIt1, typename InputIt2, typename CharT1 = iter_value_t<InputIt1>>
176ScoreAlignment<double> partial_ratio_impl(const detail::Range<InputIt1>& s1,
177 const detail::Range<InputIt2>& s2, double score_cutoff)
178{
179 CachedRatio<CharT1> cached_ratio(s1);
180
181 detail::CharSet<CharT1> s1_char_set;
182 for (auto ch : s1)
183 s1_char_set.insert(ch);
184
185 return partial_ratio_impl(s1, s2, cached_ratio, s1_char_set, score_cutoff);
186}
187
188} // namespace fuzz_detail
189
190template <typename InputIt1, typename InputIt2>
191ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
192 InputIt2 last2, double score_cutoff)
193{
194 size_t len1 = static_cast<size_t>(std::distance(first1, last1));
195 size_t len2 = static_cast<size_t>(std::distance(first2, last2));
196
197 if (len1 > len2) {
198 ScoreAlignment<double> result = partial_ratio_alignment(first2, last2, first1, last1, score_cutoff);
199 std::swap(result.src_start, result.dest_start);
200 std::swap(result.src_end, result.dest_end);
201 return result;
202 }
203
204 if (score_cutoff > 100) return ScoreAlignment<double>(0, 0, len1, 0, len1);
205
206 if (!len1 || !len2)
207 return ScoreAlignment<double>(static_cast<double>(len1 == len2) * 100.0, 0, len1, 0, len1);
208
209 auto s1 = detail::Range(first1, last1);
210 auto s2 = detail::Range(first2, last2);
211
212 auto alignment = fuzz_detail::partial_ratio_impl(s1, s2, score_cutoff);
213 if (alignment.score != 100 && s1.size() == s2.size()) {
214 score_cutoff = std::max(score_cutoff, alignment.score);
215 auto alignment2 = fuzz_detail::partial_ratio_impl(s2, s1, score_cutoff);
216 if (alignment2.score > alignment.score) {
217 std::swap(alignment2.src_start, alignment2.dest_start);
218 std::swap(alignment2.src_end, alignment2.dest_end);
219 return alignment2;
220 }
221 }
222
223 return alignment;
224}
225
226template <typename Sentence1, typename Sentence2>
227ScoreAlignment<double> partial_ratio_alignment(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
228{
229 return partial_ratio_alignment(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
230 detail::to_end(s2), score_cutoff);
231}
232
233template <typename InputIt1, typename InputIt2>
234double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
235{
236 return partial_ratio_alignment(first1, last1, first2, last2, score_cutoff).score;
237}
238
239template <typename Sentence1, typename Sentence2>
240double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
241{
242 return partial_ratio_alignment(s1, s2, score_cutoff).score;
243}
244
245template <typename CharT1>
246template <typename InputIt1>
247CachedPartialRatio<CharT1>::CachedPartialRatio(InputIt1 first1, InputIt1 last1)
248 : s1(first1, last1), cached_ratio(first1, last1)
249{
250 for (const auto& ch : s1)
251 s1_char_set.insert(ch);
252}
253
254template <typename CharT1>
255template <typename InputIt2>
256double CachedPartialRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
257 [[maybe_unused]] double score_hint) const
258{
259 size_t len1 = s1.size();
260 size_t len2 = static_cast<size_t>(std::distance(first2, last2));
261
262 if (len1 > len2)
263 return partial_ratio(detail::to_begin(s1), detail::to_end(s1), first2, last2, score_cutoff);
264
265 if (score_cutoff > 100) return 0;
266
267 if (!len1 || !len2) return static_cast<double>(len1 == len2) * 100.0;
268
269 auto s1_ = detail::Range(s1);
270 auto s2 = detail::Range(first2, last2);
271
272 double score = fuzz_detail::partial_ratio_impl(s1_, s2, cached_ratio, s1_char_set, score_cutoff).score;
273 if (score != 100 && s1_.size() == s2.size()) {
274 score_cutoff = std::max(score_cutoff, score);
275 double score2 = fuzz_detail::partial_ratio_impl(s2, s1_, score_cutoff).score;
276 if (score2 > score) return score2;
277 }
278
279 return score;
280}
281
282template <typename CharT1>
283template <typename Sentence2>
284double CachedPartialRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
285 [[maybe_unused]] double score_hint) const
286{
287 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
288}
289
290/**********************************************
291 * token_sort_ratio
292 *********************************************/
293template <typename InputIt1, typename InputIt2>
294double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
295{
296 if (score_cutoff > 100) return 0;
297
298 return ratio(detail::sorted_split(first1, last1).join(), detail::sorted_split(first2, last2).join(),
299 score_cutoff);
300}
301
302template <typename Sentence1, typename Sentence2>
303double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
304{
305 return token_sort_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
306 detail::to_end(s2), score_cutoff);
307}
308
309template <typename CharT1>
310template <typename InputIt2>
311double CachedTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
312 [[maybe_unused]] double score_hint) const
313{
314 if (score_cutoff > 100) return 0;
315
316 return cached_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
317}
318
319template <typename CharT1>
320template <typename Sentence2>
321double CachedTokenSortRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
322 [[maybe_unused]] double score_hint) const
323{
324 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
325}
326
327/**********************************************
328 * partial_token_sort_ratio
329 *********************************************/
330
331template <typename InputIt1, typename InputIt2>
332double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
333 double score_cutoff)
334{
335 if (score_cutoff > 100) return 0;
336
337 return partial_ratio(detail::sorted_split(first1, last1).join(),
338 detail::sorted_split(first2, last2).join(), score_cutoff);
339}
340
341template <typename Sentence1, typename Sentence2>
342double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
343{
344 return partial_token_sort_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
345 detail::to_end(s2), score_cutoff);
346}
347
348template <typename CharT1>
349template <typename InputIt2>
350double CachedPartialTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
351 [[maybe_unused]] double score_hint) const
352{
353 if (score_cutoff > 100) return 0;
354
355 return cached_partial_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
356}
357
358template <typename CharT1>
359template <typename Sentence2>
360double CachedPartialTokenSortRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
361 [[maybe_unused]] double score_hint) const
362{
363 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
364}
365
366/**********************************************
367 * token_set_ratio
368 *********************************************/
369
370namespace fuzz_detail {
371template <typename InputIt1, typename InputIt2>
372double token_set_ratio(const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_a,
373 const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
374 const double score_cutoff)
375{
376 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
377 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
378 if (tokens_a.empty() || tokens_b.empty()) return 0;
379
380 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
381 auto intersect = decomposition.intersection;
382 auto diff_ab = decomposition.difference_ab;
383 auto diff_ba = decomposition.difference_ba;
384
385 // one sentence is part of the other one
386 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
387
388 auto diff_ab_joined = diff_ab.join();
389 auto diff_ba_joined = diff_ba.join();
390
391 size_t ab_len = diff_ab_joined.size();
392 size_t ba_len = diff_ba_joined.size();
393 size_t sect_len = intersect.length();
394
395 // string length sect+ab <-> sect and sect+ba <-> sect
396 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
397 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
398
399 double result = 0;
400 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
401 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
402
403 if (dist <= cutoff_distance) result = norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff);
404
405 // exit early since the other ratios are 0
406 if (!sect_len) return result;
407
408 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
409 // since only sect is similar in them the distance can be calculated based on
410 // the length difference
411 size_t sect_ab_dist = bool(sect_len) + ab_len;
412 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
413
414 size_t sect_ba_dist = bool(sect_len) + ba_len;
415 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
416
417 return std::max({result, sect_ab_ratio, sect_ba_ratio});
418}
419} // namespace fuzz_detail
420
421template <typename InputIt1, typename InputIt2>
422double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
423{
424 if (score_cutoff > 100) return 0;
425
426 return fuzz_detail::token_set_ratio(detail::sorted_split(first1, last1),
427 detail::sorted_split(first2, last2), score_cutoff);
428}
429
430template <typename Sentence1, typename Sentence2>
431double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
432{
433 return token_set_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
434 score_cutoff);
435}
436
437template <typename CharT1>
438template <typename InputIt2>
439double CachedTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
440 [[maybe_unused]] double score_hint) const
441{
442 if (score_cutoff > 100) return 0;
443
444 return fuzz_detail::token_set_ratio(tokens_s1, detail::sorted_split(first2, last2), score_cutoff);
445}
446
447template <typename CharT1>
448template <typename Sentence2>
449double CachedTokenSetRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
450 [[maybe_unused]] double score_hint) const
451{
452 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
453}
454
455/**********************************************
456 * partial_token_set_ratio
457 *********************************************/
458
459namespace fuzz_detail {
460template <typename InputIt1, typename InputIt2>
461double partial_token_set_ratio(const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_a,
462 const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
463 const double score_cutoff)
464{
465 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
466 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
467 if (tokens_a.empty() || tokens_b.empty()) return 0;
468
469 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
470
471 // exit early when there is a common word in both sequences
472 if (!decomposition.intersection.empty()) return 100;
473
474 return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(),
475 score_cutoff);
476}
477} // namespace fuzz_detail
478
479template <typename InputIt1, typename InputIt2>
480double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
481 double score_cutoff)
482{
483 if (score_cutoff > 100) return 0;
484
485 return fuzz_detail::partial_token_set_ratio(detail::sorted_split(first1, last1),
486 detail::sorted_split(first2, last2), score_cutoff);
487}
488
489template <typename Sentence1, typename Sentence2>
490double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
491{
492 return partial_token_set_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
493 detail::to_end(s2), score_cutoff);
494}
495
496template <typename CharT1>
497template <typename InputIt2>
498double CachedPartialTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
499 [[maybe_unused]] double score_hint) const
500{
501 if (score_cutoff > 100) return 0;
502
503 return fuzz_detail::partial_token_set_ratio(tokens_s1, detail::sorted_split(first2, last2), score_cutoff);
504}
505
506template <typename CharT1>
507template <typename Sentence2>
508double CachedPartialTokenSetRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
509 [[maybe_unused]] double score_hint) const
510{
511 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
512}
513
514/**********************************************
515 * token_ratio
516 *********************************************/
517
518template <typename InputIt1, typename InputIt2>
519double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
520{
521 if (score_cutoff > 100) return 0;
522
523 auto tokens_a = detail::sorted_split(first1, last1);
524 auto tokens_b = detail::sorted_split(first2, last2);
525
526 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
527 auto intersect = decomposition.intersection;
528 auto diff_ab = decomposition.difference_ab;
529 auto diff_ba = decomposition.difference_ba;
530
531 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
532
533 auto diff_ab_joined = diff_ab.join();
534 auto diff_ba_joined = diff_ba.join();
535
536 size_t ab_len = diff_ab_joined.size();
537 size_t ba_len = diff_ba_joined.size();
538 size_t sect_len = intersect.length();
539
540 double result = ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
541
542 // string length sect+ab <-> sect and sect+ba <-> sect
543 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
544 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
545
546 size_t cutoff_distance = fuzz_detail::score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
547 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
548 if (dist <= cutoff_distance)
549 result = std::max(result, fuzz_detail::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
550
551 // exit early since the other ratios are 0
552 if (!sect_len) return result;
553
554 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
555 // since only sect is similar in them the distance can be calculated based on
556 // the length difference
557 size_t sect_ab_dist = bool(sect_len) + ab_len;
558 double sect_ab_ratio = fuzz_detail::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
559
560 size_t sect_ba_dist = bool(sect_len) + ba_len;
561 double sect_ba_ratio = fuzz_detail::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
562
563 return std::max({result, sect_ab_ratio, sect_ba_ratio});
564}
565
566template <typename Sentence1, typename Sentence2>
567double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
568{
569 return token_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
570 score_cutoff);
571}
572
573namespace fuzz_detail {
574template <typename CharT1, typename CachedCharT1, typename InputIt2>
575double token_ratio(const rapidfuzz::detail::SplittedSentenceView<CharT1>& s1_tokens,
576 const CachedRatio<CachedCharT1>& cached_ratio_s1_sorted, InputIt2 first2, InputIt2 last2,
577 double score_cutoff)
578{
579 if (score_cutoff > 100) return 0;
580
581 auto s2_tokens = detail::sorted_split(first2, last2);
582
583 auto decomposition = detail::set_decomposition(s1_tokens, s2_tokens);
584 auto intersect = decomposition.intersection;
585 auto diff_ab = decomposition.difference_ab;
586 auto diff_ba = decomposition.difference_ba;
587
588 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
589
590 auto diff_ab_joined = diff_ab.join();
591 auto diff_ba_joined = diff_ba.join();
592
593 size_t ab_len = diff_ab_joined.size();
594 size_t ba_len = diff_ba_joined.size();
595 size_t sect_len = intersect.length();
596
597 double result = cached_ratio_s1_sorted.similarity(s2_tokens.join(), score_cutoff);
598
599 // string length sect+ab <-> sect and sect+ba <-> sect
600 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
601 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
602
603 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
604 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
605 if (dist <= cutoff_distance)
606 result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
607
608 // exit early since the other ratios are 0
609 if (!sect_len) return result;
610
611 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
612 // since only sect is similar in them the distance can be calculated based on
613 // the length difference
614 size_t sect_ab_dist = bool(sect_len) + ab_len;
615 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
616
617 size_t sect_ba_dist = bool(sect_len) + ba_len;
618 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
619
620 return std::max({result, sect_ab_ratio, sect_ba_ratio});
621}
622
623// todo this is a temporary solution until WRatio is properly implemented using other scorers
624template <typename CharT1, typename InputIt1, typename InputIt2>
625double token_ratio(const std::vector<CharT1>& s1_sorted,
626 const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
627 const detail::BlockPatternMatchVector& blockmap_s1_sorted, InputIt2 first2, InputIt2 last2,
628 double score_cutoff)
629{
630 if (score_cutoff > 100) return 0;
631
632 auto tokens_b = detail::sorted_split(first2, last2);
633
634 auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
635 auto intersect = decomposition.intersection;
636 auto diff_ab = decomposition.difference_ab;
637 auto diff_ba = decomposition.difference_ba;
638
639 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty())) return 100;
640
641 auto diff_ab_joined = diff_ab.join();
642 auto diff_ba_joined = diff_ba.join();
643
644 size_t ab_len = diff_ab_joined.size();
645 size_t ba_len = diff_ba_joined.size();
646 size_t sect_len = intersect.length();
647
648 double result = 0;
649 auto s2_sorted = tokens_b.join();
650 if (s1_sorted.size() < 65) {
651 double norm_sim = detail::indel_normalized_similarity(blockmap_s1_sorted, detail::Range(s1_sorted),
652 detail::Range(s2_sorted), score_cutoff / 100);
653 result = norm_sim * 100;
654 }
655 else {
656 result = fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
657 }
658
659 // string length sect+ab <-> sect and sect+ba <-> sect
660 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
661 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
662
663 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
664 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
665 if (dist <= cutoff_distance)
666 result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
667
668 // exit early since the other ratios are 0
669 if (!sect_len) return result;
670
671 // levenshtein distance sect+ab <-> sect and sect+ba <-> sect
672 // since only sect is similar in them the distance can be calculated based on
673 // the length difference
674 size_t sect_ab_dist = bool(sect_len) + ab_len;
675 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
676
677 size_t sect_ba_dist = bool(sect_len) + ba_len;
678 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
679
680 return std::max({result, sect_ab_ratio, sect_ba_ratio});
681}
682} // namespace fuzz_detail
683
684template <typename CharT1>
685template <typename InputIt2>
686double CachedTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
687 [[maybe_unused]] double score_hint) const
688{
689 return fuzz_detail::token_ratio(s1_tokens, cached_ratio_s1_sorted, first2, last2, score_cutoff);
690}
691
692template <typename CharT1>
693template <typename Sentence2>
694double CachedTokenRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
695 [[maybe_unused]] double score_hint) const
696{
697 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
698}
699
700/**********************************************
701 * partial_token_ratio
702 *********************************************/
703
704template <typename InputIt1, typename InputIt2>
705double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
706 double score_cutoff)
707{
708 if (score_cutoff > 100) return 0;
709
710 auto tokens_a = detail::sorted_split(first1, last1);
711 auto tokens_b = detail::sorted_split(first2, last2);
712
713 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
714
715 // exit early when there is a common word in both sequences
716 if (!decomposition.intersection.empty()) return 100;
717
718 auto diff_ab = decomposition.difference_ab;
719 auto diff_ba = decomposition.difference_ba;
720
721 double result = partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
722
723 // do not calculate the same partial_ratio twice
724 if (tokens_a.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
725 return result;
726 }
727
728 score_cutoff = std::max(score_cutoff, result);
729 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
730}
731
732template <typename Sentence1, typename Sentence2>
733double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
734{
735 return partial_token_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
736 detail::to_end(s2), score_cutoff);
737}
738
739namespace fuzz_detail {
740template <typename CharT1, typename InputIt1, typename InputIt2>
741double partial_token_ratio(const std::vector<CharT1>& s1_sorted,
742 const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
743 InputIt2 first2, InputIt2 last2, double score_cutoff)
744{
745 if (score_cutoff > 100) return 0;
746
747 auto tokens_b = detail::sorted_split(first2, last2);
748
749 auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
750
751 // exit early when there is a common word in both sequences
752 if (!decomposition.intersection.empty()) return 100;
753
754 auto diff_ab = decomposition.difference_ab;
755 auto diff_ba = decomposition.difference_ba;
756
757 double result = partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
758
759 // do not calculate the same partial_ratio twice
760 if (tokens_s1.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
761 return result;
762 }
763
764 score_cutoff = std::max(score_cutoff, result);
765 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
766}
767
768} // namespace fuzz_detail
769
770template <typename CharT1>
771template <typename InputIt2>
772double CachedPartialTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
773 [[maybe_unused]] double score_hint) const
774{
775 return fuzz_detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
776}
777
778template <typename CharT1>
779template <typename Sentence2>
780double CachedPartialTokenRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
781 [[maybe_unused]] double score_hint) const
782{
783 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
784}
785
786/**********************************************
787 * WRatio
788 *********************************************/
789
790template <typename InputIt1, typename InputIt2>
791double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
792{
793 if (score_cutoff > 100) return 0;
794
795 constexpr double UNBASE_SCALE = 0.95;
796
797 auto len1 = std::distance(first1, last1);
798 auto len2 = std::distance(first2, last2);
799
800 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
801 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
802 if (!len1 || !len2) return 0;
803
804 double len_ratio = (len1 > len2) ? static_cast<double>(len1) / static_cast<double>(len2)
805 : static_cast<double>(len2) / static_cast<double>(len1);
806
807 double end_ratio = ratio(first1, last1, first2, last2, score_cutoff);
808
809 if (len_ratio < 1.5) {
810 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
811 return std::max(end_ratio, token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE);
812 }
813
814 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
815
816 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
817 end_ratio =
818 std::max(end_ratio, partial_ratio(first1, last1, first2, last2, score_cutoff) * PARTIAL_SCALE);
819
820 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
821 return std::max(end_ratio, partial_token_ratio(first1, last1, first2, last2, score_cutoff) *
822 UNBASE_SCALE * PARTIAL_SCALE);
823}
824
825template <typename Sentence1, typename Sentence2>
826double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
827{
828 return WRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
829 score_cutoff);
830}
831
832template <typename Sentence1>
833template <typename InputIt1>
834CachedWRatio<Sentence1>::CachedWRatio(InputIt1 first1, InputIt1 last1)
835 : s1(first1, last1),
836 cached_partial_ratio(first1, last1),
837 tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
838 s1_sorted(tokens_s1.join()),
839 blockmap_s1_sorted(detail::Range(s1_sorted))
840{}
841
842template <typename CharT1>
843template <typename InputIt2>
844double CachedWRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
845 [[maybe_unused]] double score_hint) const
846{
847 if (score_cutoff > 100) return 0;
848
849 constexpr double UNBASE_SCALE = 0.95;
850
851 size_t len1 = s1.size();
852 size_t len2 = static_cast<size_t>(std::distance(first2, last2));
853
854 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
855 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
856 if (!len1 || !len2) return 0;
857
858 double len_ratio = (len1 > len2) ? static_cast<double>(len1) / static_cast<double>(len2)
859 : static_cast<double>(len2) / static_cast<double>(len1);
860
861 double end_ratio = cached_partial_ratio.cached_ratio.similarity(first2, last2, score_cutoff);
862
863 if (len_ratio < 1.5) {
864 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
865 // use pre calculated values
866 auto r =
867 fuzz_detail::token_ratio(s1_sorted, tokens_s1, blockmap_s1_sorted, first2, last2, score_cutoff);
868 return std::max(end_ratio, r * UNBASE_SCALE);
869 }
870
871 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
872
873 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
874 end_ratio =
875 std::max(end_ratio, cached_partial_ratio.similarity(first2, last2, score_cutoff) * PARTIAL_SCALE);
876
877 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
878 auto r = fuzz_detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
879 return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
880}
881
882template <typename CharT1>
883template <typename Sentence2>
884double CachedWRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
885 [[maybe_unused]] double score_hint) const
886{
887 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
888}
889
890/**********************************************
891 * QRatio
892 *********************************************/
893
894template <typename InputIt1, typename InputIt2>
895double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff)
896{
897 ptrdiff_t len1 = std::distance(first1, last1);
898 ptrdiff_t len2 = std::distance(first2, last2);
899
900 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
901 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
902 if (!len1 || !len2) return 0;
903
904 return ratio(first1, last1, first2, last2, score_cutoff);
905}
906
907template <typename Sentence1, typename Sentence2>
908double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff)
909{
910 return QRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
911 score_cutoff);
912}
913
914template <typename CharT1>
915template <typename InputIt2>
916double CachedQRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2, double score_cutoff,
917 [[maybe_unused]] double score_hint) const
918{
919 auto len2 = std::distance(first2, last2);
920
921 /* in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
922 * see https://github.com/rapidfuzz/RapidFuzz/issues/110 */
923 if (s1.empty() || !len2) return 0;
924
925 return cached_ratio.similarity(first2, last2, score_cutoff);
926}
927
928template <typename CharT1>
929template <typename Sentence2>
930double CachedQRatio<CharT1>::similarity(const Sentence2& s2, double score_cutoff,
931 [[maybe_unused]] double score_hint) const
932{
933 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
934}
935
936} // namespace rapidfuzz::fuzz
double WRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a weighted ratio based on the other ratio algorithms.
Definition: fuzz_impl.hpp:826
double ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates a simple ratio between two strings
Definition: fuzz_impl.hpp:27
double QRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a quick ratio between two strings using fuzz.ratio.
Definition: fuzz_impl.hpp:908
double partial_token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::partial_r...
Definition: fuzz_impl.hpp:490
double token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::token_set_ratio and fuzz::token_sort_ratio (faster th...
Definition: fuzz_impl.hpp:567
double partial_token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::partial_token_set_ratio and fuzz::partial_token_sort_...
Definition: fuzz_impl.hpp:733
double token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::ratio.
Definition: fuzz_impl.hpp:431
double partial_token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::partial_ratio between them.
Definition: fuzz_impl.hpp:342
double token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::ratio between them.
Definition: fuzz_impl.hpp:303
double partial_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates the fuzz::ratio of the optimal string alignment
Definition: fuzz_impl.hpp:240