Skip to content

Commit 46720b3

Browse files
committed
Compare by codepoints
1 parent 2d7420f commit 46720b3

File tree

3 files changed

+96
-55
lines changed

3 files changed

+96
-55
lines changed

spec/ruby/core/string/comparison_spec.rb

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -60,9 +60,18 @@
6060
("ÄÖÜ" <=> "ÄÖÛ").should == 1
6161
end
6262

63-
it "ignores encoding difference" do
64-
("ÄÖÛ".dup.force_encoding("utf-8") <=> "ÄÖÜ".dup.force_encoding("iso-8859-1")).should == -1
65-
("ÄÖÜ".dup.force_encoding("utf-8") <=> "ÄÖÛ".dup.force_encoding("iso-8859-1")).should == 1
63+
ruby_version_is ''...'3.4' do
64+
it "ignores encoding difference" do
65+
("ÄÖÛ".dup.force_encoding("utf-8") <=> "ÄÖÜ".dup.force_encoding("iso-8859-1")).should == -1
66+
("ÄÖÜ".dup.force_encoding("utf-8") <=> "ÄÖÛ".dup.force_encoding("iso-8859-1")).should == 1
67+
end
68+
end
69+
70+
ruby_version_is '3.4' do
71+
it "ignores encoding difference" do
72+
("ÄÖÛ" <=> "ÄÖÜ".encode("iso-8859-1")).should == -1
73+
("ÄÖÜ" <=> "ÄÖÛ".encode("iso-8859-1")).should == 1
74+
end
6675
end
6776

6877
it "returns 0 with identical ASCII-compatible bytes of different encodings" do

string.c

Lines changed: 83 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -4074,35 +4074,57 @@ rb_str_cmp(VALUE str1, VALUE str2)
40744074
RSTRING_GETMEM(str1, ptr1, len1);
40754075
RSTRING_GETMEM(str2, ptr2, len2);
40764076

4077-
rb_encoding *enc1 = rb_enc_get(str1);
4078-
rb_encoding *enc2 = rb_enc_get(str2);
4079-
if (rb_enc_mbminlen(enc1) > 1 && rb_enc_mbminlen(enc2) > 1 &&
4080-
rb_str_comparable(str1, str2)) {
4081-
/* wchar-base encoding */
4082-
const char *end1 = ptr1 + len1, *end2 = ptr2 + len2;
4083-
unsigned int c1, c2;
4084-
int r1, r2;
4085-
4086-
while (len1 > 0 && len2 > 0) {
4087-
if (!MBCLEN_CHARFOUND_P(r1 = rb_enc_precise_mbclen(ptr1, end1, enc1)))
4088-
break;
4089-
if (!MBCLEN_CHARFOUND_P(r2 = rb_enc_precise_mbclen(ptr2, end2, enc2)))
4090-
break;
4091-
c1 = rb_enc_mbc_to_codepoint(ptr1, end1, enc1);
4092-
c2 = rb_enc_mbc_to_codepoint(ptr2, end2, enc2);
4093-
len1 = end1 - (ptr1 += MBCLEN_CHARFOUND_LEN(r1));
4094-
len2 = end2 - (ptr2 += MBCLEN_CHARFOUND_LEN(r2));
4095-
if (c1 != c2) {
4096-
return c1 < c2 ? -1 : 1;
4097-
}
4098-
}
4099-
if (len1 == 0 && len2 == 0) return 0;
4100-
if (len1 == 0) return -1;
4101-
if (len2 == 0) return 1;
4077+
if (ptr1 == ptr2) {
4078+
long len = lesser(len1, len2);
4079+
ptr1 += len; len1 -= len;
4080+
ptr2 += len; len2 -= len;
4081+
}
4082+
if (len1 == 0 && len2 == 0) return 0;
4083+
if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4084+
if ((retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4085+
if (len1 == len2) {
4086+
if (!rb_str_comparable(str1, str2)) {
4087+
if (ENCODING_GET(str1) > ENCODING_GET(str2))
4088+
return 1;
4089+
return -1;
4090+
}
4091+
return 0;
4092+
}
4093+
return (len1 > len2) ? 1 : -1;
4094+
}
4095+
return (retval > 0) ? 1 : -1;
4096+
}
4097+
else {
4098+
rb_encoding *enc1 = rb_enc_get(str1);
4099+
rb_encoding *enc2 = rb_enc_get(str2);
4100+
const char *p1end = ptr1 + len1, *p2end = ptr2 + len2;
4101+
const char *p1 = ptr1, *p2 = ptr2;
4102+
unsigned int c1, c2;
4103+
int r1, r2;
4104+
4105+
while (p1 < p1end && p2 < p2end) {
4106+
if ((r1 = rb_enc_precise_mbclen(p1, p1end, enc1)) <= 0)
4107+
break;
4108+
if ((r2 = rb_enc_precise_mbclen(p2, p2end, enc2)) <= 0)
4109+
break;
4110+
c1 = rb_enc_mbc_to_codepoint(p1, p1end, enc1);
4111+
c2 = rb_enc_mbc_to_codepoint(p2, p2end, enc2);
4112+
p1 += MBCLEN_CHARFOUND_LEN(r1);
4113+
p2 += MBCLEN_CHARFOUND_LEN(r2);
4114+
if (c1 != c2) {
4115+
return c1 < c2 ? -1 : 1;
4116+
}
4117+
}
4118+
len1 = p1end - (ptr1 = p1);
4119+
len2 = p2end - (ptr2 = p2);
4120+
if (len1 == 0 && len2 == 0) goto same_binary;
4121+
if (len1 == 0) return -1;
4122+
if (len2 == 0) return 1;
41024123
}
41034124

41044125
if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
41054126
if (len1 == len2) {
4127+
same_binary:
41064128
if (!rb_str_comparable(str1, str2)) {
41074129
if (ENCODING_GET(str1) > ENCODING_GET(str2))
41084130
return 1;
@@ -4263,6 +4285,12 @@ str_casecmp(VALUE str1, VALUE str2)
42634285

42644286
p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
42654287
p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4288+
if (p1 == p2) {
4289+
len = lesser(p1end-p1, p2end-p2);
4290+
p1 += len;
4291+
p2 += len;
4292+
}
4293+
if (p1 == p1end && p2 == p2end) return INT2FIX(0);
42664294
if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
42674295
while (p1 < p1end && p2 < p2end) {
42684296
if (*p1 != *p2) {
@@ -4276,34 +4304,38 @@ str_casecmp(VALUE str1, VALUE str2)
42764304
}
42774305
}
42784306
else {
4279-
while (p1 < p1end && p2 < p2end) {
4280-
int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4281-
int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4307+
rb_encoding *enc1 = rb_enc_get(str1);
4308+
rb_encoding *enc2 = rb_enc_get(str2);
4309+
unsigned int c1, c2;
4310+
int r1, r2;
42824311

4283-
if (0 <= c1 && 0 <= c2) {
4284-
c1 = TOLOWER(c1);
4285-
c2 = TOLOWER(c2);
4286-
if (c1 != c2)
4287-
return INT2FIX(c1 < c2 ? -1 : 1);
4312+
while (p1 < p1end && p2 < p2end) {
4313+
if ((r1 = rb_enc_precise_mbclen(p1, p1end, enc1)) <= 0)
4314+
break;
4315+
if ((r2 = rb_enc_precise_mbclen(p2, p2end, enc2)) <= 0)
4316+
break;
4317+
c1 = rb_enc_mbc_to_codepoint(p1, p1end, enc1);
4318+
c2 = rb_enc_mbc_to_codepoint(p2, p2end, enc2);
4319+
p1 += MBCLEN_CHARFOUND_LEN(r1);
4320+
p2 += MBCLEN_CHARFOUND_LEN(r2);
4321+
if (ISASCII(c1)) c1 = TOLOWER(c1);
4322+
if (ISASCII(c2)) c2 = TOLOWER(c2);
4323+
if (c1 != c2) {
4324+
return INT2FIX(c1 < c2 ? -1 : 1);
42884325
}
4289-
else {
4290-
int r;
4291-
l1 = rb_enc_mbclen(p1, p1end, enc);
4292-
l2 = rb_enc_mbclen(p2, p2end, enc);
4293-
len = l1 < l2 ? l1 : l2;
4294-
r = memcmp(p1, p2, len);
4295-
if (r != 0)
4296-
return INT2FIX(r < 0 ? -1 : 1);
4297-
if (l1 != l2)
4298-
return INT2FIX(l1 < l2 ? -1 : 1);
4299-
}
4300-
p1 += l1;
4301-
p2 += l2;
4302-
}
4303-
}
4304-
if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4305-
if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4306-
return INT2FIX(-1);
4326+
}
4327+
if (p1 < p1end && p2 < p2end) {
4328+
len = lesser(p1end-p1, p2end-p2);
4329+
if ((r1 = memcmp(p1, p2, len)) != 0)
4330+
return INT2FIX(r1 < 0 ? -1 : 1);
4331+
p1 += len;
4332+
p2 += len;
4333+
}
4334+
}
4335+
4336+
if (p1 < p1end) return INT2FIX(1);
4337+
if (p2 < p2end) return INT2FIX(-1);
4338+
return INT2FIX(0);
43074339
}
43084340

43094341
/*

test/ruby/test_m17n_comb.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -589,7 +589,7 @@ def test_str_casecmp
589589
#puts "#{encdump(s1)}.casecmp(#{encdump(s2)})"
590590
next unless s1.valid_encoding? && s2.valid_encoding? && Encoding.compatible?(s1, s2)
591591
r = s1.casecmp(s2)
592-
assert_equal(s1.upcase <=> s2.upcase, r)
592+
assert_equal(s1.upcase <=> s2.upcase, r, proc {"#{encdump s1}.casecmp(#{encdump s2})"})
593593
}
594594
end
595595

0 commit comments

Comments
 (0)