X Tutup
Skip to content

Commit dca824c

Browse files
authored
Merge pull request #6393 from Kelimion/unicode
Unicode
2 parents 1b23231 + d880404 commit dca824c

File tree

10 files changed

+46464
-34
lines changed

10 files changed

+46464
-34
lines changed

core/unicode/generated.odin

Lines changed: 2335 additions & 0 deletions
Large diffs are not rendered by default.

core/unicode/inrange.odin

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package unicode
2+
3+
/*
4+
Check to see if the rune `r` is in `range`
5+
*/
6+
in_range :: proc(r: rune, range: Range) -> bool {
7+
8+
if r <= 0xFFFF {
9+
r16 := cast(u16) r
10+
11+
length := len(range.ranges_16)
12+
index := binary_search(r16, range.ranges_16, length/2, 2) if length > 0 else -1
13+
if index >= 0 && range.ranges_16[index] <= r16 && range.ranges_16[index+1] >= r16 {
14+
return true
15+
}
16+
17+
length = len(range.single_16)
18+
index = binary_search(r16, range.single_16, length, 1) if length > 0 else -1
19+
if index >= 0 && range.single_16[index] == r16 {
20+
return true
21+
}
22+
}
23+
24+
r32 := cast(i32) r
25+
26+
length := len(range.ranges_32)
27+
index := binary_search(r32, range.ranges_32, length/2, 2) if length >0 else -1
28+
if index >= 0 && range.ranges_32[index] <= r32 && range.ranges_32[index+1] >= r32 {
29+
return true
30+
}
31+
32+
length = len(range.single_32)
33+
index = binary_search(r32, range.single_32, length, 1) if length > 0 else -1
34+
if index >= 0 && range.single_32[index] == r32 {
35+
return true
36+
}
37+
38+
39+
return false
40+
}
41+
42+
43+
44+
45+

core/unicode/letter.odin

Lines changed: 93 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ ZERO_WIDTH_JOINER :: '\u200D'
1313
WORD_JOINER :: '\u2060'
1414

1515
@(require_results)
16-
binary_search :: proc(c: i32, table: []i32, length, stride: int, loc := #caller_location) -> int #no_bounds_check {
16+
binary_search :: proc(c: $T, table: []T, length, stride: int, loc := #caller_location) -> int #no_bounds_check {
1717
runtime.bounds_check_error_loc(loc, length*stride-1, len(table))
1818
n := length
1919
t := 0
@@ -75,36 +75,30 @@ is_lower :: proc(r: rune) -> bool #no_bounds_check {
7575
if r <= MAX_ASCII {
7676
return u32(r)-'a' < 26
7777
}
78-
c := i32(r)
79-
p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3)
80-
if p >= 0 && to_upper_ranges[p] <= c && c <= to_upper_ranges[p+1] {
81-
return true
82-
}
83-
p = binary_search(c, to_upper_singlets[:], len(to_upper_singlets)/2, 2)
84-
if p >= 0 && c == to_upper_singlets[p] {
85-
return true
86-
}
87-
return false
78+
return in_range(r, ll_ranges) || in_range(r, other_lowercase_ranges)
8879
}
8980

9081
@(require_results)
9182
is_upper :: proc(r: rune) -> bool #no_bounds_check {
9283
if r <= MAX_ASCII {
9384
return u32(r)-'A' < 26
9485
}
95-
c := i32(r)
96-
p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3)
97-
if p >= 0 && to_lower_ranges[p] <= c && c <= to_lower_ranges[p+1] {
98-
return true
99-
}
100-
p = binary_search(c, to_lower_singlets[:], len(to_lower_singlets)/2, 2)
101-
if p >= 0 && c == to_lower_singlets[p] {
102-
return true
103-
}
104-
return false
86+
return in_range(r, lu_ranges) || in_range(r, other_uppercase_ranges)
10587
}
10688

10789
is_alpha :: is_letter
90+
91+
/*
92+
Return true if the rune `r` is a letter. Being a letter means that the rune has
93+
the Unicode general category property of L. In practice, the character will have
94+
a general category property of Ll, Lm, Lo, Lt, or Lu.
95+
96+
Inputs:
97+
- r: The rune which will be checked for having the property of being a letter.
98+
99+
Returns:
100+
`true` when the rune `r` is a letter. `false` will be returned in all other cases.
101+
*/
108102
@(require_results)
109103
is_letter :: proc(r: rune) -> bool #no_bounds_check {
110104
if u32(r) <= MAX_LATIN1 {
@@ -114,28 +108,55 @@ is_letter :: proc(r: rune) -> bool #no_bounds_check {
114108
return true
115109
}
116110

117-
c := i32(r)
118-
p := binary_search(c, alpha_ranges[:], len(alpha_ranges)/2, 2)
119-
if p >= 0 && alpha_ranges[p] <= c && c <= alpha_ranges[p+1] {
120-
return true
121-
}
122-
p = binary_search(c, alpha_singlets[:], len(alpha_singlets), 1)
123-
if p >= 0 && c == alpha_singlets[p] {
124-
return true
125-
}
126-
return false
111+
ll_lu := in_range(r, ll_ranges) || in_range(r, lu_ranges)
112+
113+
return ll_lu || in_range(r, lo_ranges) || in_range(r, lt_ranges) || in_range(r, lm_ranges)
127114
}
128115

129116
@(require_results)
130117
is_title :: proc(r: rune) -> bool {
131118
return is_upper(r) && is_lower(r)
132119
}
133120

121+
/*
122+
Returns true if the rune `r` is in the General Category Nd
123+
124+
Inputs:
125+
- r: The rune to check if it is in the general category Nd.
126+
127+
Returns:
128+
`true` if the rune is in the general category Nd and `false` otherwise
129+
130+
*/
131+
is_decimal :: proc(r: rune) -> bool {
132+
return in_range(r, nd_ranges)
133+
}
134+
135+
/*
136+
This function determines if a rune is a digit. To be a digit the
137+
character must have a Numeric_Type of either Digit or Decimal.
138+
139+
Inputs:
140+
- r: The rune to check if it is a digit.
141+
142+
Returns:
143+
`true` if the rune `r` is a digit, `false` in all other cases
144+
145+
*/
134146
@(require_results)
135147
is_digit :: proc(r: rune) -> bool {
136148
if r <= MAX_LATIN1 {
137-
return '0' <= r && r <= '9'
149+
return ('0' <= r && r <= '9') || r == 0x00B9 || (r >= 0x00B2 && r <= 0x0B3)
138150
}
151+
152+
if in_range(r, nd_ranges) {
153+
return true
154+
}
155+
156+
if in_range(r, extra_digits_ranges) {
157+
return true
158+
}
159+
139160
return false
140161
}
141162

@@ -176,6 +197,15 @@ is_graphic :: proc(r: rune) -> bool {
176197
if u32(r) <= MAX_LATIN1 {
177198
return char_properties[u8(r)]&pg != 0
178199
}
200+
201+
if is_letter(r) || is_number(r) || is_punct(r) || is_symbol(r) || in_range(r, zs_ranges) {
202+
return true
203+
}
204+
205+
if in_range(r, mc_ranges) || in_range(r, me_ranges) || in_range(r, mn_ranges) {
206+
return true
207+
}
208+
179209
return false
180210
}
181211

@@ -195,27 +225,56 @@ is_control :: proc(r: rune) -> bool #no_bounds_check {
195225
return false
196226
}
197227

228+
/*
229+
Checks to see if the rune `r` is a number. This means the rune is a member
230+
of the general category Nd, Nl, or No.
231+
232+
Inputs:
233+
- r: The rune to check if it is a number.
234+
235+
Returns:
236+
`true` if the rune belongs to the general category Nd, Nl, or No. `false`
237+
is returned in all other cases.
238+
239+
*/
198240
@(require_results)
199241
is_number :: proc(r: rune) -> bool #no_bounds_check {
200242
if u32(r) <= MAX_LATIN1 {
201243
return char_properties[u8(r)]&pN != 0
202244
}
203-
return false
245+
246+
return in_range(r, nd_ranges) || in_range(r, nl_ranges) || in_range(r, no_ranges)
204247
}
205248

206249
@(require_results)
207250
is_punct :: proc(r: rune) -> bool #no_bounds_check {
208251
if u32(r) <= MAX_LATIN1 {
209252
return char_properties[u8(r)]&pP != 0
210253
}
211-
return false
254+
255+
if in_range(r, pc_ranges) || in_range(r, pd_ranges) || in_range(r, pe_ranges) {
256+
return true
257+
}
258+
259+
if in_range(r, pf_ranges) || in_range(r, pi_ranges) || in_range(r, po_ranges) {
260+
return true
261+
}
262+
263+
return in_range(r, ps_ranges)
212264
}
213265

214266
@(require_results)
215267
is_symbol :: proc(r: rune) -> bool #no_bounds_check {
216268
if u32(r) <= MAX_LATIN1 {
217269
return char_properties[u8(r)]&pS != 0
218270
}
271+
272+
s := in_range(r, sc_ranges) || in_range(r, sm_ranges)
273+
274+
if s || in_range(r, so_ranges) || in_range(r, sk_ranges) {
275+
return true
276+
}
277+
219278
return false
220279
}
221280

0 commit comments

Comments
 (0)
X Tutup