5#include "rapidfuzz/details/Range.hpp"
7#include <rapidfuzz/details/CharSet.hpp>
22template <
typename InputIt1,
typename InputIt2>
23double ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
25 return ratio(detail::make_range(first1, last1), detail::make_range(first2, last2), score_cutoff);
28template <
typename Sentence1,
typename Sentence2>
29double ratio(
const Sentence1& s1,
const Sentence2& s2,
const double score_cutoff)
31 return indel_normalized_similarity(s1, s2, score_cutoff / 100) * 100;
34template <
typename CharT1>
35template <
typename InputIt2>
36double CachedRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
37 double score_hint)
const
39 return similarity(detail::make_range(first2, last2), score_cutoff, score_hint);
42template <
typename CharT1>
43template <
typename Sentence2>
44double CachedRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double score_hint)
const
46 return cached_indel.normalized_similarity(s2, score_cutoff / 100, score_hint / 100) * 100;
53namespace fuzz_detail {
55static RAPIDFUZZ_CONSTEXPR_CXX14
double norm_distance(
size_t dist,
size_t lensum,
double score_cutoff = 0)
58 (lensum > 0) ? (100.0 - 100.0 *
static_cast<double>(dist) /
static_cast<double>(lensum)) : 100.0;
60 return (score >= score_cutoff) ? score : 0;
63static inline size_t score_cutoff_to_distance(
double score_cutoff,
size_t lensum)
65 return static_cast<size_t>(std::ceil(
static_cast<double>(lensum) * (1.0 - score_cutoff / 100)));
68template <
typename InputIt1,
typename InputIt2,
typename CachedCharT1>
70partial_ratio_impl(
const detail::Range<InputIt1>& s1,
const detail::Range<InputIt2>& s2,
71 const CachedRatio<CachedCharT1>& cached_ratio,
72 const detail::CharSet<iter_value_t<InputIt1>>& s1_char_set,
double score_cutoff)
74 ScoreAlignment<double> res;
75 size_t len1 = s1.size();
76 size_t len2 = s2.size();
83 size_t maximum = len1 * 2;
84 double norm_cutoff_sim = rapidfuzz::detail::NormSim_to_NormDist(score_cutoff / 100);
85 size_t cutoff_dist =
static_cast<size_t>(std::ceil(
static_cast<double>(maximum) * norm_cutoff_sim));
86 size_t best_dist = std::numeric_limits<size_t>::max();
87 std::vector<size_t> scores(len2 - len1, std::numeric_limits<size_t>::max());
88 std::vector<std::pair<size_t, size_t>> windows = {{0, len2 - len1 - 1}};
89 std::vector<std::pair<size_t, size_t>> new_windows;
91 while (!windows.empty()) {
92 for (
const auto& window : windows) {
93 auto subseq1_first = s2.begin() +
static_cast<ptrdiff_t
>(window.first);
94 auto subseq2_first = s2.begin() +
static_cast<ptrdiff_t
>(window.second);
96 detail::make_range(subseq1_first, subseq1_first +
static_cast<ptrdiff_t
>(len1));
98 detail::make_range(subseq2_first, subseq2_first +
static_cast<ptrdiff_t
>(len1));
100 if (scores[window.first] == std::numeric_limits<size_t>::max()) {
101 scores[window.first] = cached_ratio.cached_indel.distance(subseq1);
102 if (scores[window.first] < cutoff_dist) {
103 cutoff_dist = best_dist = scores[window.first];
104 res.dest_start = window.first;
105 res.dest_end = window.first + len1;
106 if (best_dist == 0) {
112 if (scores[window.second] == std::numeric_limits<size_t>::max()) {
113 scores[window.second] = cached_ratio.cached_indel.distance(subseq2);
114 if (scores[window.second] < cutoff_dist) {
115 cutoff_dist = best_dist = scores[window.second];
116 res.dest_start = window.second;
117 res.dest_end = window.second + len1;
118 if (best_dist == 0) {
125 size_t cell_diff = window.second - window.first;
126 if (cell_diff == 1)
continue;
129 size_t known_edits = detail::abs_diff(scores[window.first], scores[window.second]);
131 size_t max_score_improvement = (cell_diff - known_edits / 2) / 2 * 2;
132 ptrdiff_t min_score =
133 static_cast<ptrdiff_t
>(std::min(scores[window.first], scores[window.second])) -
134 static_cast<ptrdiff_t
>(max_score_improvement);
135 if (min_score <
static_cast<ptrdiff_t
>(cutoff_dist)) {
136 size_t center = cell_diff / 2;
137 new_windows.emplace_back(window.first, window.first + center);
138 new_windows.emplace_back(window.first + center, window.second);
142 std::swap(windows, new_windows);
146 double score = 1.0 - (
static_cast<double>(best_dist) /
static_cast<double>(maximum));
148 if (score >= score_cutoff) score_cutoff = res.score = score;
151 for (
size_t i = 1; i < len1; ++i) {
152 auto subseq = rapidfuzz::detail::make_range(s2.begin(), s2.begin() +
static_cast<ptrdiff_t
>(i));
153 if (!s1_char_set.find(subseq.back()))
continue;
155 double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
156 if (ls_ratio > res.score) {
157 score_cutoff = res.score = ls_ratio;
160 if (res.score == 100.0)
return res;
164 for (
size_t i = len2 - len1; i < len2; ++i) {
165 auto subseq = rapidfuzz::detail::make_range(s2.begin() +
static_cast<ptrdiff_t
>(i), s2.end());
166 if (!s1_char_set.find(subseq.front()))
continue;
168 double ls_ratio = cached_ratio.similarity(subseq, score_cutoff);
169 if (ls_ratio > res.score) {
170 score_cutoff = res.score = ls_ratio;
173 if (res.score == 100.0)
return res;
180template <
typename InputIt1,
typename InputIt2,
typename CharT1 = iter_value_t<InputIt1>>
181ScoreAlignment<double> partial_ratio_impl(
const detail::Range<InputIt1>& s1,
182 const detail::Range<InputIt2>& s2,
double score_cutoff)
184 CachedRatio<CharT1> cached_ratio(s1);
186 detail::CharSet<CharT1> s1_char_set;
188 s1_char_set.insert(ch);
190 return partial_ratio_impl(s1, s2, cached_ratio, s1_char_set, score_cutoff);
195template <
typename InputIt1,
typename InputIt2>
196ScoreAlignment<double> partial_ratio_alignment(InputIt1 first1, InputIt1 last1, InputIt2 first2,
197 InputIt2 last2,
double score_cutoff)
199 size_t len1 =
static_cast<size_t>(std::distance(first1, last1));
200 size_t len2 =
static_cast<size_t>(std::distance(first2, last2));
203 ScoreAlignment<double> result = partial_ratio_alignment(first2, last2, first1, last1, score_cutoff);
204 std::swap(result.src_start, result.dest_start);
205 std::swap(result.src_end, result.dest_end);
209 if (score_cutoff > 100)
return ScoreAlignment<double>(0, 0, len1, 0, len1);
212 return ScoreAlignment<double>(
static_cast<double>(len1 == len2) * 100.0, 0, len1, 0, len1);
214 auto s1 = detail::make_range(first1, last1);
215 auto s2 = detail::make_range(first2, last2);
217 auto alignment = fuzz_detail::partial_ratio_impl(s1, s2, score_cutoff);
218 if (alignment.score != 100 && s1.size() == s2.size()) {
219 score_cutoff = std::max(score_cutoff, alignment.score);
220 auto alignment2 = fuzz_detail::partial_ratio_impl(s2, s1, score_cutoff);
221 if (alignment2.score > alignment.score) {
222 std::swap(alignment2.src_start, alignment2.dest_start);
223 std::swap(alignment2.src_end, alignment2.dest_end);
231template <
typename Sentence1,
typename Sentence2>
232ScoreAlignment<double> partial_ratio_alignment(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
234 return partial_ratio_alignment(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
235 detail::to_end(s2), score_cutoff);
238template <
typename InputIt1,
typename InputIt2>
239double partial_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
241 return partial_ratio_alignment(first1, last1, first2, last2, score_cutoff).score;
244template <
typename Sentence1,
typename Sentence2>
245double partial_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
247 return partial_ratio_alignment(s1, s2, score_cutoff).score;
250template <
typename CharT1>
251template <
typename InputIt1>
252CachedPartialRatio<CharT1>::CachedPartialRatio(InputIt1 first1, InputIt1 last1)
253 : s1(first1, last1), cached_ratio(first1, last1)
255 for (
const auto& ch : s1)
256 s1_char_set.insert(ch);
259template <
typename CharT1>
260template <
typename InputIt2>
261double CachedPartialRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
264 size_t len1 = s1.size();
265 size_t len2 =
static_cast<size_t>(std::distance(first2, last2));
268 return partial_ratio(detail::to_begin(s1), detail::to_end(s1), first2, last2, score_cutoff);
270 if (score_cutoff > 100)
return 0;
272 if (!len1 || !len2)
return static_cast<double>(len1 == len2) * 100.0;
274 auto s1_ = detail::make_range(s1);
275 auto s2 = detail::make_range(first2, last2);
277 double score = fuzz_detail::partial_ratio_impl(s1_, s2, cached_ratio, s1_char_set, score_cutoff).score;
278 if (score != 100 && s1_.size() == s2.size()) {
279 score_cutoff = std::max(score_cutoff, score);
280 double score2 = fuzz_detail::partial_ratio_impl(s2, s1_, score_cutoff).score;
281 if (score2 > score)
return score2;
287template <
typename CharT1>
288template <
typename Sentence2>
289double CachedPartialRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
291 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
297template <
typename InputIt1,
typename InputIt2>
298double token_sort_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
300 if (score_cutoff > 100)
return 0;
302 return ratio(detail::sorted_split(first1, last1).join(), detail::sorted_split(first2, last2).join(),
306template <
typename Sentence1,
typename Sentence2>
309 return token_sort_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2),
310 detail::to_end(s2), score_cutoff);
313template <
typename CharT1>
314template <
typename InputIt2>
315double CachedTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
318 if (score_cutoff > 100)
return 0;
320 return cached_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
323template <
typename CharT1>
324template <
typename Sentence2>
325double CachedTokenSortRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
327 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
334template <
typename InputIt1,
typename InputIt2>
338 if (score_cutoff > 100)
return 0;
340 return partial_ratio(detail::sorted_split(first1, last1).join(),
341 detail::sorted_split(first2, last2).join(), score_cutoff);
344template <
typename Sentence1,
typename Sentence2>
348 detail::to_end(s2), score_cutoff);
351template <
typename CharT1>
352template <
typename InputIt2>
353double CachedPartialTokenSortRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
356 if (score_cutoff > 100)
return 0;
358 return cached_partial_ratio.similarity(detail::sorted_split(first2, last2).join(), score_cutoff);
361template <
typename CharT1>
362template <
typename Sentence2>
363double CachedPartialTokenSortRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
365 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
372namespace fuzz_detail {
373template <
typename InputIt1,
typename InputIt2>
374double token_set_ratio(
const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_a,
375 const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
376 const double score_cutoff)
380 if (tokens_a.empty() || tokens_b.empty())
return 0;
382 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
383 auto intersect = decomposition.intersection;
384 auto diff_ab = decomposition.difference_ab;
385 auto diff_ba = decomposition.difference_ba;
388 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty()))
return 100;
390 auto diff_ab_joined = diff_ab.join();
391 auto diff_ba_joined = diff_ba.join();
393 size_t ab_len = diff_ab_joined.size();
394 size_t ba_len = diff_ba_joined.size();
395 size_t sect_len = intersect.length();
398 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
399 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
402 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
403 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
405 if (dist <= cutoff_distance) result = norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff);
408 if (!sect_len)
return result;
413 size_t sect_ab_dist = bool(sect_len) + ab_len;
414 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
416 size_t sect_ba_dist = bool(sect_len) + ba_len;
417 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
419 return std::max({result, sect_ab_ratio, sect_ba_ratio});
423template <
typename InputIt1,
typename InputIt2>
424double token_set_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
426 if (score_cutoff > 100)
return 0;
428 return fuzz_detail::token_set_ratio(detail::sorted_split(first1, last1),
429 detail::sorted_split(first2, last2), score_cutoff);
432template <
typename Sentence1,
typename Sentence2>
435 return token_set_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
439template <
typename CharT1>
440template <
typename InputIt2>
441double CachedTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
444 if (score_cutoff > 100)
return 0;
446 return fuzz_detail::token_set_ratio(tokens_s1, detail::sorted_split(first2, last2), score_cutoff);
449template <
typename CharT1>
450template <
typename Sentence2>
451double CachedTokenSetRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
453 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
460namespace fuzz_detail {
461template <
typename InputIt1,
typename InputIt2>
463 const rapidfuzz::detail::SplittedSentenceView<InputIt2>& tokens_b,
464 const double score_cutoff)
468 if (tokens_a.empty() || tokens_b.empty())
return 0;
470 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
473 if (!decomposition.intersection.empty())
return 100;
475 return partial_ratio(decomposition.difference_ab.join(), decomposition.difference_ba.join(),
480template <
typename InputIt1,
typename InputIt2>
484 if (score_cutoff > 100)
return 0;
486 return fuzz_detail::partial_token_set_ratio(detail::sorted_split(first1, last1),
487 detail::sorted_split(first2, last2), score_cutoff);
490template <
typename Sentence1,
typename Sentence2>
494 detail::to_end(s2), score_cutoff);
497template <
typename CharT1>
498template <
typename InputIt2>
499double CachedPartialTokenSetRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
502 if (score_cutoff > 100)
return 0;
504 return fuzz_detail::partial_token_set_ratio(tokens_s1, detail::sorted_split(first2, last2), score_cutoff);
507template <
typename CharT1>
508template <
typename Sentence2>
509double CachedPartialTokenSetRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
511 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
518template <
typename InputIt1,
typename InputIt2>
519double token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
521 if (score_cutoff > 100)
return 0;
523 auto tokens_a = detail::sorted_split(first1, last1);
524 auto tokens_b = detail::sorted_split(first2, last2);
526 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
527 auto intersect = decomposition.intersection;
528 auto diff_ab = decomposition.difference_ab;
529 auto diff_ba = decomposition.difference_ba;
531 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty()))
return 100;
533 auto diff_ab_joined = diff_ab.join();
534 auto diff_ba_joined = diff_ba.join();
536 size_t ab_len = diff_ab_joined.size();
537 size_t ba_len = diff_ba_joined.size();
538 size_t sect_len = intersect.length();
540 double result =
ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
543 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
544 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
546 size_t cutoff_distance = fuzz_detail::score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
547 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
548 if (dist <= cutoff_distance)
549 result = std::max(result, fuzz_detail::norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
552 if (!sect_len)
return result;
557 size_t sect_ab_dist = bool(sect_len) + ab_len;
558 double sect_ab_ratio = fuzz_detail::norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
560 size_t sect_ba_dist = bool(sect_len) + ba_len;
561 double sect_ba_ratio = fuzz_detail::norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
563 return std::max({result, sect_ab_ratio, sect_ba_ratio});
566template <
typename Sentence1,
typename Sentence2>
567double token_ratio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
569 return token_ratio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
573namespace fuzz_detail {
574template <
typename CharT1,
typename CachedCharT1,
typename InputIt2>
575double token_ratio(
const rapidfuzz::detail::SplittedSentenceView<CharT1>& s1_tokens,
576 const CachedRatio<CachedCharT1>& cached_ratio_s1_sorted, InputIt2 first2, InputIt2 last2,
579 if (score_cutoff > 100)
return 0;
581 auto s2_tokens = detail::sorted_split(first2, last2);
583 auto decomposition = detail::set_decomposition(s1_tokens, s2_tokens);
584 auto intersect = decomposition.intersection;
585 auto diff_ab = decomposition.difference_ab;
586 auto diff_ba = decomposition.difference_ba;
588 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty()))
return 100;
590 auto diff_ab_joined = diff_ab.join();
591 auto diff_ba_joined = diff_ba.join();
593 size_t ab_len = diff_ab_joined.size();
594 size_t ba_len = diff_ba_joined.size();
595 size_t sect_len = intersect.length();
597 double result = cached_ratio_s1_sorted.similarity(s2_tokens.join(), score_cutoff);
600 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
601 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
603 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
604 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
605 if (dist <= cutoff_distance)
606 result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
609 if (!sect_len)
return result;
614 size_t sect_ab_dist = bool(sect_len) + ab_len;
615 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
617 size_t sect_ba_dist = bool(sect_len) + ba_len;
618 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
620 return std::max({result, sect_ab_ratio, sect_ba_ratio});
624template <
typename CharT1,
typename InputIt1,
typename InputIt2>
625double token_ratio(
const std::vector<CharT1>& s1_sorted,
626 const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
627 const detail::BlockPatternMatchVector& blockmap_s1_sorted, InputIt2 first2, InputIt2 last2,
630 if (score_cutoff > 100)
return 0;
632 auto tokens_b = detail::sorted_split(first2, last2);
634 auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
635 auto intersect = decomposition.intersection;
636 auto diff_ab = decomposition.difference_ab;
637 auto diff_ba = decomposition.difference_ba;
639 if (!intersect.empty() && (diff_ab.empty() || diff_ba.empty()))
return 100;
641 auto diff_ab_joined = diff_ab.join();
642 auto diff_ba_joined = diff_ba.join();
644 size_t ab_len = diff_ab_joined.size();
645 size_t ba_len = diff_ba_joined.size();
646 size_t sect_len = intersect.length();
649 auto s2_sorted = tokens_b.join();
650 if (s1_sorted.size() < 65) {
652 detail::indel_normalized_similarity(blockmap_s1_sorted, detail::make_range(s1_sorted),
653 detail::make_range(s2_sorted), score_cutoff / 100);
654 result = norm_sim * 100;
657 result = fuzz::ratio(s1_sorted, s2_sorted, score_cutoff);
661 size_t sect_ab_len = sect_len + bool(sect_len) + ab_len;
662 size_t sect_ba_len = sect_len + bool(sect_len) + ba_len;
664 size_t cutoff_distance = score_cutoff_to_distance(score_cutoff, sect_ab_len + sect_ba_len);
665 size_t dist = indel_distance(diff_ab_joined, diff_ba_joined, cutoff_distance);
666 if (dist <= cutoff_distance)
667 result = std::max(result, norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff));
670 if (!sect_len)
return result;
675 size_t sect_ab_dist = bool(sect_len) + ab_len;
676 double sect_ab_ratio = norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff);
678 size_t sect_ba_dist = bool(sect_len) + ba_len;
679 double sect_ba_ratio = norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff);
681 return std::max({result, sect_ab_ratio, sect_ba_ratio});
685template <
typename CharT1>
686template <
typename InputIt2>
687double CachedTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
690 return fuzz_detail::token_ratio(s1_tokens, cached_ratio_s1_sorted, first2, last2, score_cutoff);
693template <
typename CharT1>
694template <
typename Sentence2>
695double CachedTokenRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
697 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
704template <
typename InputIt1,
typename InputIt2>
705double partial_token_ratio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
708 if (score_cutoff > 100)
return 0;
710 auto tokens_a = detail::sorted_split(first1, last1);
711 auto tokens_b = detail::sorted_split(first2, last2);
713 auto decomposition = detail::set_decomposition(tokens_a, tokens_b);
716 if (!decomposition.intersection.empty())
return 100;
718 auto diff_ab = decomposition.difference_ab;
719 auto diff_ba = decomposition.difference_ba;
721 double result =
partial_ratio(tokens_a.join(), tokens_b.join(), score_cutoff);
724 if (tokens_a.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
728 score_cutoff = std::max(score_cutoff, result);
729 return std::max(result,
partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
732template <
typename Sentence1,
typename Sentence2>
736 detail::to_end(s2), score_cutoff);
739namespace fuzz_detail {
740template <
typename CharT1,
typename InputIt1,
typename InputIt2>
741double partial_token_ratio(
const std::vector<CharT1>& s1_sorted,
742 const rapidfuzz::detail::SplittedSentenceView<InputIt1>& tokens_s1,
743 InputIt2 first2, InputIt2 last2,
double score_cutoff)
745 if (score_cutoff > 100)
return 0;
747 auto tokens_b = detail::sorted_split(first2, last2);
749 auto decomposition = detail::set_decomposition(tokens_s1, tokens_b);
752 if (!decomposition.intersection.empty())
return 100;
754 auto diff_ab = decomposition.difference_ab;
755 auto diff_ba = decomposition.difference_ba;
757 double result = partial_ratio(s1_sorted, tokens_b.join(), score_cutoff);
760 if (tokens_s1.word_count() == diff_ab.word_count() && tokens_b.word_count() == diff_ba.word_count()) {
764 score_cutoff = std::max(score_cutoff, result);
765 return std::max(result, partial_ratio(diff_ab.join(), diff_ba.join(), score_cutoff));
770template <
typename CharT1>
771template <
typename InputIt2>
772double CachedPartialTokenRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
775 return fuzz_detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
778template <
typename CharT1>
779template <
typename Sentence2>
780double CachedPartialTokenRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
782 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
789template <
typename InputIt1,
typename InputIt2>
790double WRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
792 if (score_cutoff > 100)
return 0;
794 constexpr double UNBASE_SCALE = 0.95;
796 auto len1 = std::distance(first1, last1);
797 auto len2 = std::distance(first2, last2);
801 if (!len1 || !len2)
return 0;
803 double len_ratio = (len1 > len2) ?
static_cast<double>(len1) /
static_cast<double>(len2)
804 : static_cast<double>(len2) / static_cast<double>(len1);
806 double end_ratio =
ratio(first1, last1, first2, last2, score_cutoff);
808 if (len_ratio < 1.5) {
809 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
810 return std::max(end_ratio,
token_ratio(first1, last1, first2, last2, score_cutoff) * UNBASE_SCALE);
813 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
815 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
817 std::max(end_ratio,
partial_ratio(first1, last1, first2, last2, score_cutoff) * PARTIAL_SCALE);
819 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
820 return std::max(end_ratio,
partial_token_ratio(first1, last1, first2, last2, score_cutoff) *
821 UNBASE_SCALE * PARTIAL_SCALE);
824template <
typename Sentence1,
typename Sentence2>
825double WRatio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
827 return WRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
831template <
typename Sentence1>
832template <
typename InputIt1>
833CachedWRatio<Sentence1>::CachedWRatio(InputIt1 first1, InputIt1 last1)
835 cached_partial_ratio(first1, last1),
836 tokens_s1(detail::sorted_split(std::begin(s1), std::end(s1))),
837 s1_sorted(tokens_s1.join()),
838 blockmap_s1_sorted(detail::make_range(s1_sorted))
841template <
typename CharT1>
842template <
typename InputIt2>
843double CachedWRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
double)
const
845 if (score_cutoff > 100)
return 0;
847 constexpr double UNBASE_SCALE = 0.95;
849 size_t len1 = s1.size();
850 size_t len2 =
static_cast<size_t>(std::distance(first2, last2));
854 if (!len1 || !len2)
return 0;
856 double len_ratio = (len1 > len2) ?
static_cast<double>(len1) /
static_cast<double>(len2)
857 : static_cast<double>(len2) / static_cast<double>(len1);
859 double end_ratio = cached_partial_ratio.cached_ratio.similarity(first2, last2, score_cutoff);
861 if (len_ratio < 1.5) {
862 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
865 fuzz_detail::token_ratio(s1_sorted, tokens_s1, blockmap_s1_sorted, first2, last2, score_cutoff);
866 return std::max(end_ratio, r * UNBASE_SCALE);
869 const double PARTIAL_SCALE = (len_ratio < 8.0) ? 0.9 : 0.6;
871 score_cutoff = std::max(score_cutoff, end_ratio) / PARTIAL_SCALE;
873 std::max(end_ratio, cached_partial_ratio.similarity(first2, last2, score_cutoff) * PARTIAL_SCALE);
875 score_cutoff = std::max(score_cutoff, end_ratio) / UNBASE_SCALE;
876 auto r = fuzz_detail::partial_token_ratio(s1_sorted, tokens_s1, first2, last2, score_cutoff);
877 return std::max(end_ratio, r * UNBASE_SCALE * PARTIAL_SCALE);
880template <
typename CharT1>
881template <
typename Sentence2>
882double CachedWRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
884 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
891template <
typename InputIt1,
typename InputIt2>
892double QRatio(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
double score_cutoff)
894 ptrdiff_t len1 = std::distance(first1, last1);
895 ptrdiff_t len2 = std::distance(first2, last2);
899 if (!len1 || !len2)
return 0;
901 return ratio(first1, last1, first2, last2, score_cutoff);
904template <
typename Sentence1,
typename Sentence2>
905double QRatio(
const Sentence1& s1,
const Sentence2& s2,
double score_cutoff)
907 return QRatio(detail::to_begin(s1), detail::to_end(s1), detail::to_begin(s2), detail::to_end(s2),
911template <
typename CharT1>
912template <
typename InputIt2>
913double CachedQRatio<CharT1>::similarity(InputIt2 first2, InputIt2 last2,
double score_cutoff,
double)
const
915 auto len2 = std::distance(first2, last2);
919 if (s1.empty() || !len2)
return 0;
921 return cached_ratio.similarity(first2, last2, score_cutoff);
924template <
typename CharT1>
925template <
typename Sentence2>
926double CachedQRatio<CharT1>::similarity(
const Sentence2& s2,
double score_cutoff,
double)
const
928 return similarity(detail::to_begin(s2), detail::to_end(s2), score_cutoff);
double WRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a weighted ratio based on the other ratio algorithms.
Definition fuzz_impl.hpp:825
double ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates a simple ratio between two strings
Definition fuzz_impl.hpp:29
double QRatio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Calculates a quick ratio between two strings using fuzz.ratio.
Definition fuzz_impl.hpp:905
double partial_token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::partial_r...
Definition fuzz_impl.hpp:491
double token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::token_set_ratio and fuzz::token_sort_ratio (faster th...
Definition fuzz_impl.hpp:567
double partial_token_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Helper method that returns the maximum of fuzz::partial_token_set_ratio and fuzz::partial_token_sort_...
Definition fuzz_impl.hpp:733
double token_set_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Compares the words in the strings based on unique and common words between them using fuzz::ratio.
Definition fuzz_impl.hpp:433
double partial_token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::partial_ratio between them.
Definition fuzz_impl.hpp:345
double token_sort_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
Sorts the words in the strings and calculates the fuzz::ratio between them.
Definition fuzz_impl.hpp:307
double partial_ratio(const Sentence1 &s1, const Sentence2 &s2, double score_cutoff=0)
calculates the fuzz::ratio of the optimal string alignment
Definition fuzz_impl.hpp:245