RapidFuzz
fuzz.hpp
1/* SPDX-License-Identifier: MIT */
2/* Copyright © 2021 Max Bachmann */
3/* Copyright © 2011 Adam Cohen */
4
5#pragma once
6#include <rapidfuzz/details/CharSet.hpp>
7#include <rapidfuzz/details/PatternMatchVector.hpp>
8#include <rapidfuzz/details/common.hpp>
9#include <rapidfuzz/distance/Indel.hpp>
10
11namespace rapidfuzz::fuzz {
12
43template <typename Sentence1, typename Sentence2>
44double ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
45
46template <typename InputIt1, typename InputIt2>
47double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
48
49#ifdef RAPIDFUZZ_SIMD
50namespace experimental {
51template <int MaxLen>
52struct MultiRatio {
53public:
54 MultiRatio(size_t count) : input_count(count), scorer(count)
55 {}
56
57 size_t result_count() const
58 {
59 return scorer.result_count();
60 }
61
62 template <typename Sentence1>
63 void insert(const Sentence1& s1_)
64 {
65 insert(detail::to_begin(s1_), detail::to_end(s1_));
66 }
67
68 template <typename InputIt1>
69 void insert(InputIt1 first1, InputIt1 last1)
70 {
71 scorer.insert(first1, last1);
72 }
73
74 template <typename InputIt2>
75 void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
76 double score_cutoff = 0.0) const
77 {
78 similarity(scores, score_count, detail::Range(first2, last2), score_cutoff);
79 }
80
81 template <typename Sentence2>
82 void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
83 {
84 scorer.normalized_similarity(scores, score_count, s2, score_cutoff / 100.0);
85
86 for (size_t i = 0; i < input_count; ++i)
87 scores[i] *= 100.0;
88 }
89
90private:
91 size_t input_count;
92 rapidfuzz::experimental::MultiIndel<MaxLen> scorer;
93};
94} /* namespace experimental */
95#endif
96
97// TODO documentation
98template <typename CharT1>
99struct CachedRatio {
100 template <typename InputIt1>
101 CachedRatio(InputIt1 first1, InputIt1 last1) : cached_indel(first1, last1)
102 {}
103
104 template <typename Sentence1>
105 CachedRatio(const Sentence1& s1) : cached_indel(s1)
106 {}
107
108 template <typename InputIt2>
109 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
110 double score_hint = 0.0) const;
111
112 template <typename Sentence2>
113 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
114
115 // private:
116 CachedIndel<CharT1> cached_indel;
117};
118
119template <typename Sentence1>
120CachedRatio(const Sentence1& s1) -> CachedRatio<char_type<Sentence1>>;
121
122template <typename InputIt1>
123CachedRatio(InputIt1 first1, InputIt1 last1) -> CachedRatio<iter_value_t<InputIt1>>;
124
125template <typename InputIt1, typename InputIt2>
126ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
127 InputIt2 last2, double score_cutoff = 0);
128
129template <typename Sentence1, typename Sentence2>
130ScoreAlignment<double> partial_ratio_alignment(const Sentence1& s1, const Sentence2& s2,
131 double score_cutoff = 0);
132
158template <typename Sentence1, typename Sentence2>
159double partial_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
160
161template <typename InputIt1, typename InputIt2>
162double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
163 double score_cutoff = 0);
164
165// todo add real implementation
166template <typename CharT1>
167struct CachedPartialRatio {
168 template <typename>
169 friend struct CachedWRatio;
170
171 template <typename InputIt1>
172 CachedPartialRatio(InputIt1 first1, InputIt1 last1);
173
174 template <typename Sentence1>
175 explicit CachedPartialRatio(const Sentence1& s1_)
176 : CachedPartialRatio(detail::to_begin(s1_), detail::to_end(s1_))
177 {}
178
179 template <typename InputIt2>
180 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
181 double score_hint = 0.0) const;
182
183 template <typename Sentence2>
184 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
185
186private:
187 std::vector<CharT1> s1;
188 rapidfuzz::detail::CharSet<CharT1> s1_char_set;
189 CachedRatio<CharT1> cached_ratio;
190};
191
192template <typename Sentence1>
193explicit CachedPartialRatio(const Sentence1& s1) -> CachedPartialRatio<char_type<Sentence1>>;
194
195template <typename InputIt1>
196CachedPartialRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialRatio<iter_value_t<InputIt1>>;
197
224template <typename Sentence1, typename Sentence2>
225double token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
226
227template <typename InputIt1, typename InputIt2>
228double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
229 double score_cutoff = 0);
230
231#ifdef RAPIDFUZZ_SIMD
232namespace experimental {
233template <int MaxLen>
234struct MultiTokenSortRatio {
235public:
236 MultiTokenSortRatio(size_t count) : scorer(count)
237 {}
238
239 size_t result_count() const
240 {
241 return scorer.result_count();
242 }
243
244 template <typename Sentence1>
245 void insert(const Sentence1& s1_)
246 {
247 insert(detail::to_begin(s1_), detail::to_end(s1_));
248 }
249
250 template <typename InputIt1>
251 void insert(InputIt1 first1, InputIt1 last1)
252 {
253 scorer.insert(detail::sorted_split(first1, last1).join());
254 }
255
256 template <typename InputIt2>
257 void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
258 double score_cutoff = 0.0) const
259 {
260 scorer.similarity(scores, score_count, detail::sorted_split(first2, last2).join(), score_cutoff);
261 }
262
263 template <typename Sentence2>
264 void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
265 {
266 similarity(scores, score_count, detail::to_begin(s2), detail::to_end(s2), score_cutoff);
267 }
268
269private:
270 MultiRatio<MaxLen> scorer;
271};
272} /* namespace experimental */
273#endif
274
275// todo CachedRatio speed for equal strings vs original implementation
276// TODO documentation
277template <typename CharT1>
278struct CachedTokenSortRatio {
279 template <typename InputIt1>
280 CachedTokenSortRatio(InputIt1 first1, InputIt1 last1)
281 : s1_sorted(detail::sorted_split(first1, last1).join()), cached_ratio(s1_sorted)
282 {}
283
284 template <typename Sentence1>
285 explicit CachedTokenSortRatio(const Sentence1& s1)
286 : CachedTokenSortRatio(detail::to_begin(s1), detail::to_end(s1))
287 {}
288
289 template <typename InputIt2>
290 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
291 double score_hint = 0.0) const;
292
293 template <typename Sentence2>
294 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
295
296private:
297 std::vector<CharT1> s1_sorted;
298 CachedRatio<CharT1> cached_ratio;
299};
300
301template <typename Sentence1>
302explicit CachedTokenSortRatio(const Sentence1& s1) -> CachedTokenSortRatio<char_type<Sentence1>>;
303
304template <typename InputIt1>
305CachedTokenSortRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSortRatio<iter_value_t<InputIt1>>;
306
327template <typename Sentence1, typename Sentence2>
328double partial_token_sort_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
329
330template <typename InputIt1, typename InputIt2>
331double partial_token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
332 double score_cutoff = 0);
333
334// TODO documentation
335template <typename CharT1>
336struct CachedPartialTokenSortRatio {
337 template <typename InputIt1>
338 CachedPartialTokenSortRatio(InputIt1 first1, InputIt1 last1)
339 : s1_sorted(detail::sorted_split(first1, last1).join()), cached_partial_ratio(s1_sorted)
340 {}
341
342 template <typename Sentence1>
343 explicit CachedPartialTokenSortRatio(const Sentence1& s1)
344 : CachedPartialTokenSortRatio(detail::to_begin(s1), detail::to_end(s1))
345 {}
346
347 template <typename InputIt2>
348 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
349 double score_hint = 0.0) const;
350
351 template <typename Sentence2>
352 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
353
354private:
355 std::vector<CharT1> s1_sorted;
356 CachedPartialRatio<CharT1> cached_partial_ratio;
357};
358
359template <typename Sentence1>
360explicit CachedPartialTokenSortRatio(const Sentence1& s1)
361 -> CachedPartialTokenSortRatio<char_type<Sentence1>>;
362
363template <typename InputIt1>
364CachedPartialTokenSortRatio(InputIt1 first1,
365 InputIt1 last1) -> CachedPartialTokenSortRatio<iter_value_t<InputIt1>>;
366
395template <typename Sentence1, typename Sentence2>
396double token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
397
398template <typename InputIt1, typename InputIt2>
399double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
400 double score_cutoff = 0);
401
402// TODO documentation
403template <typename CharT1>
404struct CachedTokenSetRatio {
405 template <typename InputIt1>
406 CachedTokenSetRatio(InputIt1 first1, InputIt1 last1)
407 : s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1)))
408 {}
409
410 template <typename Sentence1>
411 explicit CachedTokenSetRatio(const Sentence1& s1_)
412 : CachedTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_))
413 {}
414
415 template <typename InputIt2>
416 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
417 double score_hint = 0.0) const;
418
419 template <typename Sentence2>
420 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
421
422private:
423 std::vector<CharT1> s1;
424 detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
425};
426
427template <typename Sentence1>
428explicit CachedTokenSetRatio(const Sentence1& s1) -> CachedTokenSetRatio<char_type<Sentence1>>;
429
430template <typename InputIt1>
431CachedTokenSetRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenSetRatio<iter_value_t<InputIt1>>;
432
452template <typename Sentence1, typename Sentence2>
453double partial_token_set_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
454
455template <typename InputIt1, typename InputIt2>
456double partial_token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
457 double score_cutoff = 0);
458
459// TODO documentation
460template <typename CharT1>
461struct CachedPartialTokenSetRatio {
462 template <typename InputIt1>
463 CachedPartialTokenSetRatio(InputIt1 first1, InputIt1 last1)
464 : s1(first1, last1), tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1)))
465 {}
466
467 template <typename Sentence1>
468 explicit CachedPartialTokenSetRatio(const Sentence1& s1_)
469 : CachedPartialTokenSetRatio(detail::to_begin(s1_), detail::to_end(s1_))
470 {}
471
472 template <typename InputIt2>
473 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
474 double score_hint = 0.0) const;
475
476 template <typename Sentence2>
477 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
478
479private:
480 std::vector<CharT1> s1;
481 detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
482};
483
484template <typename Sentence1>
485explicit CachedPartialTokenSetRatio(const Sentence1& s1) -> CachedPartialTokenSetRatio<char_type<Sentence1>>;
486
487template <typename InputIt1>
488CachedPartialTokenSetRatio(InputIt1 first1,
489 InputIt1 last1) -> CachedPartialTokenSetRatio<iter_value_t<InputIt1>>;
490
510template <typename Sentence1, typename Sentence2>
511double token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
512
513template <typename InputIt1, typename InputIt2>
514double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
515
516// todo add real implementation
517template <typename CharT1>
518struct CachedTokenRatio {
519 template <typename InputIt1>
520 CachedTokenRatio(InputIt1 first1, InputIt1 last1)
521 : s1(first1, last1),
522 s1_tokens(detail::sorted_split(std::begin(s1), std::end(s1))),
523 s1_sorted(s1_tokens.join()),
524 cached_ratio_s1_sorted(s1_sorted)
525 {}
526
527 template <typename Sentence1>
528 explicit CachedTokenRatio(const Sentence1& s1_)
529 : CachedTokenRatio(detail::to_begin(s1_), detail::to_end(s1_))
530 {}
531
532 template <typename InputIt2>
533 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
534 double score_hint = 0.0) const;
535
536 template <typename Sentence2>
537 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
538
539private:
540 std::vector<CharT1> s1;
541 detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> s1_tokens;
542 std::vector<CharT1> s1_sorted;
543 CachedRatio<CharT1> cached_ratio_s1_sorted;
544};
545
546template <typename Sentence1>
547explicit CachedTokenRatio(const Sentence1& s1) -> CachedTokenRatio<char_type<Sentence1>>;
548
549template <typename InputIt1>
550CachedTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedTokenRatio<iter_value_t<InputIt1>>;
551
572template <typename Sentence1, typename Sentence2>
573double partial_token_ratio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
574
575template <typename InputIt1, typename InputIt2>
576double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
577 double score_cutoff = 0);
578
579// todo add real implementation
580template <typename CharT1>
581struct CachedPartialTokenRatio {
582 template <typename InputIt1>
583 CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1)
584 : s1(first1, last1),
585 tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
586 s1_sorted(tokens_s1.join())
587 {}
588
589 template <typename Sentence1>
590 explicit CachedPartialTokenRatio(const Sentence1& s1_)
591 : CachedPartialTokenRatio(detail::to_begin(s1_), detail::to_end(s1_))
592 {}
593
594 template <typename InputIt2>
595 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
596 double score_hint = 0.0) const;
597
598 template <typename Sentence2>
599 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
600
601private:
602 std::vector<CharT1> s1;
603 detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
604 std::vector<CharT1> s1_sorted;
605};
606
607template <typename Sentence1>
608explicit CachedPartialTokenRatio(const Sentence1& s1) -> CachedPartialTokenRatio<char_type<Sentence1>>;
609
610template <typename InputIt1>
611CachedPartialTokenRatio(InputIt1 first1, InputIt1 last1) -> CachedPartialTokenRatio<iter_value_t<InputIt1>>;
612
634template <typename Sentence1, typename Sentence2>
635double WRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
636
637template <typename InputIt1, typename InputIt2>
638double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
639
640// todo add real implementation
641template <typename CharT1>
642struct CachedWRatio {
643 template <typename InputIt1>
644 explicit CachedWRatio(InputIt1 first1, InputIt1 last1);
645
646 template <typename Sentence1>
647 CachedWRatio(const Sentence1& s1_) : CachedWRatio(detail::to_begin(s1_), detail::to_end(s1_))
648 {}
649
650 template <typename InputIt2>
651 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
652 double score_hint = 0.0) const;
653
654 template <typename Sentence2>
655 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
656
657private:
658 // todo somehow implement this using other ratios with creating PatternMatchVector
659 // multiple times
660 std::vector<CharT1> s1;
661 CachedPartialRatio<CharT1> cached_partial_ratio;
662 detail::SplittedSentenceView<typename std::vector<CharT1>::iterator> tokens_s1;
663 std::vector<CharT1> s1_sorted;
664 rapidfuzz::detail::BlockPatternMatchVector blockmap_s1_sorted;
665};
666
667template <typename Sentence1>
668explicit CachedWRatio(const Sentence1& s1) -> CachedWRatio<char_type<Sentence1>>;
669
670template <typename InputIt1>
671CachedWRatio(InputIt1 first1, InputIt1 last1) -> CachedWRatio<iter_value_t<InputIt1>>;
672
694template <typename Sentence1, typename Sentence2>
695double QRatio(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0);
696
697template <typename InputIt1, typename InputIt2>
698double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, double score_cutoff = 0);
699
700#ifdef RAPIDFUZZ_SIMD
701namespace experimental {
702template <int MaxLen>
703struct MultiQRatio {
704public:
705 MultiQRatio(size_t count) : scorer(count)
706 {}
707
708 size_t result_count() const
709 {
710 return scorer.result_count();
711 }
712
713 template <typename Sentence1>
714 void insert(const Sentence1& s1_)
715 {
716 insert(detail::to_begin(s1_), detail::to_end(s1_));
717 }
718
719 template <typename InputIt1>
720 void insert(InputIt1 first1, InputIt1 last1)
721 {
722 scorer.insert(first1, last1);
723 str_lens.push_back(static_cast<size_t>(std::distance(first1, last1)));
724 }
725
726 template <typename InputIt2>
727 void similarity(double* scores, size_t score_count, InputIt2 first2, InputIt2 last2,
728 double score_cutoff = 0.0) const
729 {
730 similarity(scores, score_count, detail::Range(first2, last2), score_cutoff);
731 }
732
733 template <typename Sentence2>
734 void similarity(double* scores, size_t score_count, const Sentence2& s2, double score_cutoff = 0) const
735 {
736 rapidfuzz::detail::Range s2_(s2);
737 if (s2_.empty()) {
738 for (size_t i = 0; i < str_lens.size(); ++i)
739 scores[i] = 0;
740
741 return;
742 }
743
744 scorer.similarity(scores, score_count, s2, score_cutoff);
745
746 for (size_t i = 0; i < str_lens.size(); ++i)
747 if (str_lens[i] == 0) scores[i] = 0;
748 }
749
750private:
751 std::vector<size_t> str_lens;
752 MultiRatio<MaxLen> scorer;
753};
754} /* namespace experimental */
755#endif
756
757template <typename CharT1>
758struct CachedQRatio {
759 template <typename InputIt1>
760 CachedQRatio(InputIt1 first1, InputIt1 last1) : s1(first1, last1), cached_ratio(first1, last1)
761 {}
762
763 template <typename Sentence1>
764 explicit CachedQRatio(const Sentence1& s1_) : CachedQRatio(detail::to_begin(s1_), detail::to_end(s1_))
765 {}
766
767 template <typename InputIt2>
768 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0.0,
769 double score_hint = 0.0) const;
770
771 template <typename Sentence2>
772 double similarity(const Sentence2& s2, double score_cutoff = 0.0, double score_hint = 0.0) const;
773
774private:
775 std::vector<CharT1> s1;
776 CachedRatio<CharT1> cached_ratio;
777};
778
779template <typename Sentence1>
780explicit CachedQRatio(const Sentence1& s1) -> CachedQRatio<char_type<Sentence1>>;
781
782template <typename InputIt1>
783CachedQRatio(InputIt1 first1, InputIt1 last1) -> CachedQRatio<iter_value_t<InputIt1>>;
784
787} // namespace rapidfuzz::fuzz
788
789#include <rapidfuzz/fuzz_impl.hpp>
double WRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a weighted ratio based on the other ratio algorithms.
Definition: fuzz_impl.hpp:826
double ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates a simple ratio between two strings
Definition: fuzz_impl.hpp:27
double QRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a quick ratio between two strings using fuzz.ratio.
Definition: fuzz_impl.hpp:908
double partial_token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::partial_r...
Definition: fuzz_impl.hpp:490
double token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::token_set_ratio and fuzz::token_sort_ratio (faster th...
Definition: fuzz_impl.hpp:567
double partial_token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::partial_token_set_ratio and fuzz::partial_token_sort_...
Definition: fuzz_impl.hpp:733
double token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::ratio.
Definition: fuzz_impl.hpp:431
double partial_token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::partial_ratio between them.
Definition: fuzz_impl.hpp:342
double token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::ratio between them.
Definition: fuzz_impl.hpp:303
double partial_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates the fuzz::ratio of the optimal string alignment
Definition: fuzz_impl.hpp:240