rb_str_grapheme_clusters
rb_str_grapheme_clusters
strの長さ分のarrayを作ってrb_str_enumerate_grapheme_clustersに渡してそう
rb_str_enumerate_grapheme_clusters
strからencoding入手
キャッシュがあればキャッシュからgrapheme clusters入手?
キャッシュって何
grapheme clustersを取得するための正規表現オブジェクト的な?
そうっぽい。そのためにregcomp onig_compileでコンパイルしてそう
code:c
bool cached_reg_grapheme_cluster = true;
regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
if (!reg_grapheme_cluster) {
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
cached_reg_grapheme_cluster = false;
}
始端と終端の位置を取る
code:c
ptr0 = ptr = RSTRING_PTR(str);
end = RSTRING_END(str);
終端を超えるまでループ
素朴で読みやすい
code:c
while (ptr < end) {
OnigPosition len = onig_match(reg_grapheme_cluster,
(const OnigUChar *)ptr, (const OnigUChar *)end,
(const OnigUChar *)ptr, NULL, 0);
if (len <= 0) break;
ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
ptr += len;
}
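len はマッチした位置までのバイト長とかなのか?
→ そうっぽい。onig_match はマッチ成功時にマッチした長さ(バイト数、0以上)を返し、マッチしなければ負の値(ONIG_MISMATCH)を返す。だから len <= 0 のときに break している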
onig_match 登場
reg_grapheme_cluster は /\X/ とかなんだろうか
onig_match
Rubyに入っているのはOnigmo(Onigurumaのフォーク)で、参照したのはOnigurumaのドキュメントだが、引数の構造は変わってなさそう
よくわからないが、このあたりが肝になってそう
code:c
if (r == 0) {
prev = (UChar* )onigenc_get_prev_char_head(reg->enc, str, at, end);
r = match_at(reg, str, end,
onigenc_get_prev_char_head はencoding(reg->enc)を使って、at の1つ前の文字の先頭位置を取ってきてるだけぽい
match_atが本体か
match_at
おもむろにVM命令列っぽいのがバーン
2000行ある
ScanEnv
code:c
(lldb) p *env
(ScanEnv) {
option = 0
case_fold_flag = 0
enc = NULL
syntax = NULL
capture_history = 0
bt_mem_start = 0
bt_mem_end = 0
backrefed_mem = 0
pattern = 0x0000000000000000
pattern_end = 0x0000000000000000
error = 0x0000000000000000
error_end = 0x0000000000000000
reg = NULL
unset_addr_list = NULL
num_call = 0
num_mem = 0
num_named = 0
mem_alloc = 0
mem_nodes_static = {
}
mem_nodes_dynamic = NULL
parse_depth = 0
warnings_flag = 0
sourcefile = 0x0000000000000000
sourceline = 0
}
code:c
mbc_to_code inlined mbc_enc_len(p="\\X", e="", enc=<unavailable>) at utf_8.c:229:19
226 static int
227 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
228 {
code:c
(lldb) p *tok
(OnigToken) {
type = TK_STRING
escaped = 1
base = 0
backp = 0x000000016fdfe67d "X"
u = {
s = 0x0000000000000058 ""
c = 88
code = 88
anchor = (subtype = 88, ascii_range = 0)
repeat = (lower = 88, upper = 0, greedy = 0, possessive = 0)
backref = {
num = 88
ref1 = 0
refs = 0x0000000000000000
by_name = 0
exist_level = 0
level = 0
}
call = (name = "", name_end = 0x0000000000000000, gnum = 0, rel = 0)
prop = (ctype = 88, not = 0)
}
}
Process 91696 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = step in
frame #0: 0x00000001001c588c miniruby`fetch_token(tok=0x000000016fdfe200, src=0x000000016fdfe1f8, end="", env=0x000000016fdfe2c0) at regparse.c:3904:12
3901
3902 case 'X':
3903 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) {
-> 3904 tok->type = TK_EXTENDED_GRAPHEME_CLUSTER;
3905 }
3906 break;
3907
Target 0: (miniruby) stopped.
(lldb) p token
(void *) 0x0000000000000000
(lldb) p tok
(OnigToken *) 0x000000016fdfe200
(lldb) p *tok
(OnigToken) {
type = TK_STRING
escaped = 1
base = 0
backp = 0x000000016fdfe67d "X"
u = {
s = 0x0000000000000058 ""
c = 88
code = 88
anchor = (subtype = 88, ascii_range = 0)
repeat = (lower = 88, upper = 0, greedy = 0, possessive = 0)
backref = {
num = 88
ref1 = 0
refs = 0x0000000000000000
by_name = 0
exist_level = 0
level = 0
}
call = (name = "", name_end = 0x0000000000000000, gnum = 0, rel = 0)
prop = (ctype = 88, not = 0)
}
}
code:c
(lldb) p *node
(Node) {
u = {
base = (type = 0)
str = {
base = (type = 0)
s = 0x0000600001bf3220 ""
end = 0x0000600001bf3220 ""
flag = 0
capa = 0
buf = ""
}
cclass = {
base = (type = 0)
flags = 0
bs = (0 = 29307424, 1 = 24576, 2 = 29307424, 3 = 24576, 4 = 0, 5 = 0, 6 = 0, 7 = 0) mbuf = NULL
}
qtfr = {
base = (type = 0)
state = 0
target = 0x0000600001bf3220
lower = 29307424
upper = 24576
greedy = 0
target_empty_info = 0
head_exact = NULL
next_head_exact = NULL
is_referred = 0
}
enclose = {
base = (type = 0)
state = 0
type = 29307424
regnum = 24576
option = 29307424
call_addr = 24576
target = NULL
min_len = 0
max_len = 0
char_len = 0
opt_count = 0
}
bref = {
base = (type = 0)
state = 0
back_num = 29307424
back_static = (0 = 24576, 1 = 29307424, 2 = 24576, 3 = 0, 4 = 0, 5 = 0) back_dynamic = 0x0000000000000000
nest_level = 0
}
anchor = {
base = (type = 0)
type = 0
target = 0x0000600001bf3220
char_len = 29307424
ascii_range = 24576
}
cons = {
base = (type = 0)
car = 0x0000600001bf3220
cdr = 0x0000600001bf3220
}
ctype = {
base = (type = 0)
ctype = 0
not = 29307424
ascii_range = 24576
}
call = {
base = (type = 0)
state = 0
group_num = 29307424
name = 0x0000600001bf3220 ""
name_end = 0x0000000000000000
target = NULL
unset_addr_list = NULL
}
}
}
code:c
(lldb) p *reg_grapheme_cluster
(regex_t) {
p = 0x000000014e808200 "P>\b"
used = 14098
alloc = 14098
num_mem = 0
num_repeat = 0
num_null_check = 0
num_comb_exp_check = 0
num_call = 0
capture_history = 0
bt_mem_start = 0
bt_mem_end = 0
stack_pop_level = 0
repeat_range_alloc = 0
options = 57344
repeat_range = NULL
enc = 0x0000600003cf0090
syntax = 0x000000010043821c
name_table = 0x0000000000000000
case_fold_flag = 1073741824
optimize = 0
threshold_len = 0
anchor = 0
anchor_dmin = 0
anchor_dmax = 0
sub_anchor = 0
exact = 0x0000000000000000
exact_end = 0x0000000000000000
map = ""
int_map = 0x0000000000000000
int_map_backward = 0x0000000000000000
dmin = 0
dmax = 0
timelimit = 0
chain = NULL
}
code:c
Process 33084 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = step in
frame #0: 0x00000001001d4ed0 miniruby`node_extended_grapheme_cluster(np=0x000000016fdfd090, env=0x000000016fdfd240) at regparse.c:5991:7
5988 if (IS_NULL(alts0)) goto err;
5989
5990 #ifdef USE_UNICODE_PROPERTIES
-> 5991 if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
5992 CClassNode* cc;
5993
5994 if (propname2ctype(env, "Grapheme_Cluster_Break=Extend") < 0) goto err;
Target 0: (miniruby) stopped.
(lldb) p **alts
(Node) {
u = {
base = (type = 0)
str = {
base = (type = 0)
s = 0x000060000139b420 "\r\n"
end = 0x000060000139b422 ""
flag = 1
capa = 0
buf = "\r\n"
}
cclass = {
base = (type = 0)
flags = 0
bs = (0 = 20558880, 1 = 24576, 2 = 20558882, 3 = 24576, 4 = 1, 5 = 0, 6 = 2573, 7 = 0) mbuf = NULL
}
qtfr = {
base = (type = 0)
state = 0
target = 0x000060000139b420
lower = 20558882
upper = 24576
greedy = 1
target_empty_info = 0
head_exact = 0x0000000000000a0d
next_head_exact = NULL
is_referred = 0
}
enclose = {
base = (type = 0)
state = 0
type = 20558880
regnum = 24576
option = 20558882
call_addr = 24576
target = 0x0000000000000001
min_len = 2573
max_len = 0
char_len = 0
opt_count = 0
}
bref = {
base = (type = 0)
state = 0
back_num = 20558880
back_static = (0 = 24576, 1 = 20558882, 2 = 24576, 3 = 1, 4 = 0, 5 = 2573) back_dynamic = 0x0000000000000000
nest_level = 0
}
anchor = {
base = (type = 0)
type = 0
target = 0x000060000139b420
char_len = 20558882
ascii_range = 24576
}
cons = {
base = (type = 0)
car = 0x000060000139b420
cdr = 0x000060000139b422
}
ctype = {
base = (type = 0)
ctype = 0
not = 20558880
ascii_range = 24576
}
call = {
base = (type = 0)
state = 0
group_num = 20558880
name = 0x000060000139b422 ""
name_end = 0x0000000000000001 ""
target = 0x0000000000000a0d
unset_addr_list = NULL
}
}
}
---
code:test.rb
"क्त".grapheme_clusters
code:c
if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
CClassNode* cc;
(lldb) p *cc
(CClassNode) {
base = (type = 66649936)
flags = 24576
bs = (0 = 0, 1 = 4, 2 = 0, 3 = 0, 4 = 0, 5 = 0, 6 = 0, 7 = 0) mbuf = NULL
}
(lldb) p *env
(ScanEnv) {
option = 57344
case_fold_flag = 1073741824
enc = 0x0000600000f8c090
syntax = 0x00000001004532e8
capture_history = 0
bt_mem_start = 0
bt_mem_end = 0
backrefed_mem = 0
pattern = 0x000000016fdfe67c "\\X"
pattern_end = 0x000000016fdfe67e ""
error = 0x0000000000000000
error_end = 0x0000000000000000
reg = 0x0000000137704a40
unset_addr_list = NULL
num_call = 0
num_mem = 0
num_named = 0
mem_alloc = 0
mem_nodes_static = {
}
mem_nodes_dynamic = NULL
parse_depth = 1
warnings_flag = 0
sourcefile = 0x0000000000000000
sourceline = 0
}
(lldb) p *np
(Node *) NULL
(lldb) p **alts
(Node) {
u = {
base = (type = 0)
str = {
base = (type = 0)
s = 0x000060000288f060 "\r\n"
end = 0x000060000288f062 ""
flag = 1
capa = 0
buf = "\r\n"
}
cclass = {
base = (type = 0)
flags = 0
bs = (0 = 42528864, 1 = 24576, 2 = 42528866, 3 = 24576, 4 = 1, 5 = 0, 6 = 2573, 7 = 0) mbuf = NULL
}
qtfr = {
base = (type = 0)
state = 0
target = 0x000060000288f060
lower = 42528866
upper = 24576
greedy = 1
target_empty_info = 0
head_exact = 0x0000000000000a0d
next_head_exact = NULL
is_referred = 0
}
enclose = {
base = (type = 0)
state = 0
type = 42528864
regnum = 24576
option = 42528866
call_addr = 24576
target = 0x0000000000000001
min_len = 2573
max_len = 0
char_len = 0
opt_count = 0
}
bref = {
base = (type = 0)
state = 0
back_num = 42528864
back_static = (0 = 24576, 1 = 42528866, 2 = 24576, 3 = 1, 4 = 0, 5 = 2573) back_dynamic = 0x0000000000000000
nest_level = 0
}
anchor = {
base = (type = 0)
type = 0
target = 0x000060000288f060
char_len = 42528866
ascii_range = 24576
}
cons = {
base = (type = 0)
car = 0x000060000288f060
cdr = 0x000060000288f062
}
ctype = {
base = (type = 0)
ctype = 0
not = 42528864
ascii_range = 24576
}
call = {
base = (type = 0)
state = 0
group_num = 42528864
name = 0x000060000288f062 ""
name_end = 0x0000000000000001 ""
target = 0x0000000000000a0d
unset_addr_list = NULL
}
}
}
Process 35324 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = step in
frame #0: 0x00000001001d4f44 miniruby`node_extended_grapheme_cluster(np=0x000000016fdfe110, env=0x000000016fdfe2c0) at regparse.c:6005:5
6002 alts1 = node_new_cclass();
6003 if (IS_NULL(alts1)) goto err;
6004 cc = NCCLASS(alts1);
-> 6005 R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
6006 if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
6007 R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
6008 R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
Target 0: (miniruby) stopped.
(lldb) p cc
(CClassNode *) 0x000060000288fc40
(lldb) p *cc
(CClassNode) {
base = (type = 1)
flags = 0
bs = (0 = 0, 1 = 0, 2 = 0, 3 = 0, 4 = 0, 5 = 0, 6 = 0, 7 = 0) mbuf = NULL
}