asearch
曖昧テキスト検索ツール
Bitapアルゴリズムに基づく
/masui/曖昧検索asearch
Ruby用asearch by 増井俊之.icon
JavaScript(Node)用asearch by /shokai/shokai.icon
Scrapboxのタイトル検索で利用
橋本商会 » あいまいテキスト検索 AsearchをNodeに移植した
はやい
byte列を比較しているだけ
結果はtrue/falseで返ってくる
あいまい度は指定できる(0〜3まで)
他のライブラリに依存していない、pure javascript
/villagepump/scrapboxのリンクサジェスト、とても速くないか?
とてもわかり易い仕組みの図解
/nota-techconf/本当に役立つFAQ検索システムを目指して#6049e61eadf4e700007d1eef
2021-05-19 21:38:33 Unicode対応した
cf. https://github.com/masui/node-asearch/blob/master/src/asearch.coffee#L59
UserScript/denoから使えるcode
/takker/fizzSearchのコードを改変してある
code:script.js
export class Asearch {
get INITPAT() {return 0x80000000;} // 0100,0000,0000,0000,0000,0000,0000,0000
get MAXCHAR() {return 0x100;} // 256文字が上限
get INITSTATE() {return this.INITPAT, 0, 0, 0;}
constructor(source) {
this.shiftpat = Array(this.MAXCHAR).map(_ => 0);
this.epsilon = 0;
let mask = this.INITPAT;
for (const item of this.unpack(source)) {
// 0x20 is a space
if (item === 0x20) {
this.epsilon |= mask;
} else {
this.shiftpatitem |= mask;
this.shiftpatthis.toupper(item) |= mask;
this.shiftpatthis.tolower(item) |= mask;
mask >>>= 1;
}
}
this.acceptpat = mask;
}
tolower(c) {
return this.isupper(c) ? c + 0x20 : c;
}
toupper(c) {
return this.islower(c) ? c - 0x20 : c;
}
isupper(c) {
// 0x41 = A, 0x5a = Z
return (c >= 0x41) && (c <= 0x5a);
}
islower(c) {
// 0x61 = a, 0x7a = z
return (c >= 0x61) && (c <= 0x7a);
}
state(str = '') {
let i0 = this.INITSTATE0;
let i1 = this.INITSTATE1;
let i2 = this.INITSTATE2;
let i3 = this.INITSTATE3;
for (const item of this.unpack(str)) {
const mask = this.shiftpatitem;
i3 = (i3 & this.epsilon) | ((i3 & mask) >>> 1) | (i2 >>> 1) | i2;
i2 = (i2 & this.epsilon) | ((i2 & mask) >>> 1) | (i1 >>> 1) | i1;
i1 = (i1 & this.epsilon) | ((i1 & mask) >>> 1) | (i0 >>> 1) | i0;
i0 = (i0 & this.epsilon) | ((i0 & mask) >>> 1);
i1 |= i0 >>> 1;
i2 |= i1 >>> 1;
i3 |= i2 >>> 1;
}
return i0, i1, i2, i3;
}
match(str, ambig = 0) {
ambig = ambig < this.INITSTATE.length ? ambig : this.INITSTATE.length - 1;
const s = this.state(str);
return (sambig & this.acceptpat) !== 0;
}
unpack(str) {
return str.split('').map(char => char.charCodeAt(0));
}
// UTF-16 code pointを上8bitと下8bitとに分割する
/*unpack(str) {
return str.split('')
.flatMap(char => {
const code = char.charCodeAt(0);
if (code > 0xFF) {
return (code & 0xFF00) >>> 8, code & 0xFF;
}
return code & 0xFF; // 下の8ビットを取得する
});
}*/
}
rust-asearch
test code
shokai/node-asearchをJSに書き換えたのを使っている
code:js
(async () => {
const {Asearch} = await import('/api/code/programming-notes/asearch/script.js');
const a = new Asearch('abcde');
assert(a, 'abcde');
assert(a, 'AbCdE');
assert(a, 'abcd');
assert(a, 'abcd', 1);
assert(a, 'ab de', 1);
assert(a, 'abe', 1);
assert(a, 'abe', 2);
const b = new Asearch('cheese burger');
assert(b, 'cheese burger');
assert(b, 'chess burger');
assert(b, 'chess burger', 2);
assert(b, 'chess', 2);
const c = new Asearch('漢字文字列');
assert(c, '漢字文字列');
assert(c, '漢字文字烈');
assert(c, '漢字文字烈', 1);
function assert(asearch, text, ambig = 0) {
console.log(text = ${text}, ambig = ${ambig} => ${asearch.match(text, ambig)});
}
})();
WebWorker用
code:worker.js
class Asearch {
get INITPAT() {return 0x80000000;} // 0100,0000,0000,0000,0000,0000,0000,0000
get MAXCHAR() {return 0x100;} // 256文字が上限
get INITSTATE() {return this.INITPAT, 0, 0, 0;}
constructor(source) {
this.shiftpat = Array(this.MAXCHAR).map(_ => 0);
this.epsilon = 0;
let mask = this.INITPAT;
for (const item of this.unpack(source)) {
// 0x20 is a space
if (item === 0x20) {
this.epsilon |= mask;
} else {
this.shiftpatitem |= mask;
this.shiftpatthis.toupper(item) |= mask;
this.shiftpatthis.tolower(item) |= mask;
mask >>>= 1;
}
}
this.acceptpat = mask;
}
tolower(c) {
return this.isupper(c) ? c + 0x20 : c;
}
toupper(c) {
return this.islower(c) ? c - 0x20 : c;
}
isupper(c) {
// 0x41 = A, 0x5a = Z
return (c >= 0x41) && (c <= 0x5a);
}
islower(c) {
// 0x61 = a, 0x7a = z
return (c >= 0x61) && (c <= 0x7a);
}
state(str = '') {
let i0 = this.INITSTATE0;
let i1 = this.INITSTATE1;
let i2 = this.INITSTATE2;
let i3 = this.INITSTATE3;
for (const item of this.unpack(str)) {
const mask = this.shiftpatitem;
i3 = (i3 & this.epsilon) | ((i3 & mask) >>> 1) | (i2 >>> 1) | i2;
i2 = (i2 & this.epsilon) | ((i2 & mask) >>> 1) | (i1 >>> 1) | i1;
i1 = (i1 & this.epsilon) | ((i1 & mask) >>> 1) | (i0 >>> 1) | i0;
i0 = (i0 & this.epsilon) | ((i0 & mask) >>> 1);
i1 |= i0 >>> 1;
i2 |= i1 >>> 1;
i3 |= i2 >>> 1;
}
return i0, i1, i2, i3;
}
match(str, ambig = 0) {
ambig = ambig < this.INITSTATE.length ? ambig : this.INITSTATE.length - 1;
const s = this.state(str);
return (sambig & this.acceptpat) !== 0;
}
// UTF-16 code pointを上8bitと下8bitとに分割する
unpack(str) {
return str.split('')
.flatMap(char => {
const code = char.charCodeAt(0);
if (code > 0xFF) {
return (code & 0xFF00) >>> 8, code & 0xFF;
}
return code & 0xFF; // 下の8ビットを取得する
});
}
}
JavaScript.icon
Deno.icon