Syncing Scrapbox with Hugo
#misc #SEO
I set things up so that Scrapbox and my personal blog (Hugo) stay in sync.
As I mentioned in scrapboxとクローラ as well, it seemed bad for searches to land on empty link pages, so I started syncing.
Scrapbox itself has a writing feel and UX that are too good to give up, so I am syncing only a subset of pages to see whether they show up properly in search results.
The code is shown below.
For the conversion from Scrapbox notation to Markdown, I use a modified version of this gist (sb2md.js below).
JavaScript's replace() is surprisingly versatile.
Any environment that can run shell script / Python / Node.js is enough.
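main.sh below drives the sync: it pulls the page list from the Scrapbox REST API, then fetches each page's raw text and pipes it through the two converters. As a quick standalone check of the API shape the script depends on (api_check.py is illustrative, not part of the pipeline; the two fields are exactly the ones main.sh reads with jq):
code:api_check.py
import requests

# Page list for the project; each entry in "pages" carries "title" and
# "updated" (a Unix timestamp), which is all the pipeline needs.
pages = requests.get("https://scrapbox.io/api/pages/yuwd?limit=5").json()["pages"]
for p in pages:
    print(p["title"], p["updated"])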
code:main.sh
url='https://scrapbox.io/api/pages/yuwd?limit=500'
# url='https://scrapbox.io/api/pages/yuwd?limit=10'
# Unix time of each page's last update, index-aligned with the title loop below
dates=($(curl $url | jq ".pages[].updated" | xargs))
echo "COUNT=${#dates[@]}"
i=0
IFS=$'\n'
for title in $(curl $url | jq ".pages[].title" | sed -e "s/^\"//g" | sed -e "s/\"$//g")
do
    # file name: spaces to underscores, strip ? ! : which break paths/refs
    md_title=$(echo $title | tr ' ' '_' | sed s/\?//g | sed s/!//g | sed s/://g)
    # URL-encode the title; quote() also encodes the trailing newline as "%0A", hence [:-3]
    etitle=$(echo $title | python -c 'import sys; from urllib.parse import quote; [print(quote(l)[:-3], end="") for l in sys.stdin]')
    if [ ! -e $md_title.md ]; then
        echo $title
        url="https://scrapbox.io/api/pages/yuwd/${etitle}/text"
        echo $url
        curl $url | node sb2md.js > $md_title.md
        python scrapbox.py $md_title.md "${dates[$i]}"
    fi
    i=$(expr $i + 1)
done
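One non-obvious detail above is quote(l)[:-3]. Each title read from stdin still ends with a newline, and quote() encodes that newline as the three-character sequence %0A, so the slice chops exactly that. A minimal demonstration (quote_demo.py is illustrative only):
code:quote_demo.py
from urllib.parse import quote

line = "scrapboxとHugoを同期させる\n"  # a title as read from stdin, newline included
print(quote(line))        # ...%E3%82%8B%0A  (the trailing newline became "%0A")
print(quote(line)[:-3])   # the same, with the three-character "%0A" sliced off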
code:sb2md.js
// https://gist.github.com/yuntan/bb82cdf336ec76a15c66b910754f5f33
// Let any value be piped through a function: x.then(f) === f(x)
if (!Object.prototype.then) {
  Object.prototype.then = function (f) { return f.call(null, this); };
}
process.stdin.resume();
process.stdin.setEncoding('utf8');
let input_string = '';
process.stdin.on('data', chunk => {
  input_string += chunk;
});
process.stdin.on('end', () => {
  const text = input_string;
  console.log(sb2md(text));
});
function sb2md(text) {
  // code block: fold each "code:name" block onto one line with '+++'
  // placeholders so the line-wise replaces below cannot touch its contents
  const escapeCodeBlocks = s => s.replace(
    /^code:(.+)$((\n^[ \t].*$)+)/mg,
    (_, p1, p2) =>
      '```' + p1 + p2.replace(/^[ \t]/mg, '').replace(/\r\n|\r|\n/g, '+++') + '+++```'
  );
  const unescapeCodeBlocks = s => s.replace(/\+{3}/g, '\n');
  const replaceLine = line =>
    /^`{3}/.test(line) ? line :
      // level 2 heading
      line.replace(/^\[\[([^\[\]]+)\]\]$/, '## $1')
        .replace(/^\[\*\s+(\S[^\[\]]*)\]$/, '## $1')
        // anchor link
        .replace(/\[(\S.*)\s+(https?:\/\/\S+)\]/g, '[$1]($2)')
        .replace(/\[(https?:\/\/\S+)\s+(\S.*)\]/g, '[$2]($1)')
        // image block
        .replace(/^\[(https?:\/\/\S+\.(?:png|gif|jpe?g))\]$/, '![]($1)')
        .replace(/^\[(https:\/\/gyazo\.com\/\S+)\]$/, '![]($1.png)')
        // unordered list (deeper Scrapbox indent -> deeper Markdown indent)
        .replace(/^\s{6}(\S.*)$/, '          - $1')
        .replace(/^\s{5}(\S.*)$/, '        - $1')
        .replace(/^\s{4}(\S.*)$/, '      - $1')
        .replace(/^\s{3}(\S.*)$/, '    - $1')
        .replace(/^\s{2}(\S.*)$/, '  - $1')
        .replace(/^\s(\S.*)$/, '- $1')
        // bold text
        .replace(/\[\[([^\[\]]+)\]\]/g, '**$1**')
        .replace(/\[\*\s+([^\[\]]+)\]/g, '**$1**')
        // italic text
        .replace(/\[\/\s+([^\[\]]+)\]/g, '*$1*');
  return text
    .then(escapeCodeBlocks)
    .split(/\r\n|\r|\n/)
    // first line is level 1 heading
    .then(lines => [lines[0].replace(/^(.+)$/, '# $1')].concat(lines.slice(1)))
    .map(replaceLine)
    .join('\n')
    .then(unescapeCodeBlocks);
}
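The one real trick in sb2md.js is the escape/unescape pair: code blocks must survive the line-wise replaces untouched, so each block is first folded onto a single line with +++ standing in for its newlines, and unfolded again at the very end. The same idea in Python, as a self-contained sketch (escape_demo.py and its function names are hypothetical, not part of the pipeline):
code:escape_demo.py
import re

def escape_code_blocks(s):
    # fold each "code:name" block onto one line, '+++' standing in for newlines
    return re.sub(
        r"^code:(.+)$((?:\n[ \t].*)+)",
        lambda m: "```" + m.group(1)
            + re.sub(r"^[ \t]", "", m.group(2), flags=re.M).replace("\n", "+++")
            + "+++```",
        s, flags=re.M)

def unescape_code_blocks(s):
    return s.replace("+++", "\n")

src = "code:hello.py\n print('hi')\n print('bye')"
escaped = escape_code_blocks(src)
assert "\n" not in escaped  # now safe for line-by-line regex replaces
print(unescape_code_blocks(escaped))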
code:scrapbox.py
import os
import re
import sys
import datetime
import requests
from urllib.parse import quote
# Scrapbox headings like [** Heading] (whole-line [* ...] is already handled by sb2md.js)
head_pattern = re.compile(r"\[\*+ ([^\$\*\[\]]+)\]")
# inline math: [$ x^2]
math_pattern = re.compile(r"\[\$\s+([^\]]+)\]")
# bare page links: [Page Title] not already followed by "(" (a converted markdown link)
link_pattern = re.compile(r"\[([^\$\*\[\]]+)\](?!\()")
# external links that survived conversion: [Title https://example.com]
url_link_pattern = re.compile(r"\[([^\$\*\[\]]+) (https?://[^\]]+)\]")
tag_pattern = re.compile(r'#([^\s]+)')
img_pattern = re.compile(r'!\[\]\((https?://.+)\)')
# a list item consisting solely of inline math, promoted to display math below
large_math_pattern = re.compile(r'\s*-\s+(\$[^\$]+\$)\s*$')
if len(sys.argv) < 3:
    print('Usage: scrapbox.py <file> <date>')
    sys.exit(1)
print(sys.argv)

lines = []
tags = []
in_snippets = False
with open(sys.argv[1], 'r') as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    if len(line.replace(" ", "")) <= 1: continue
    if line.startswith("`"): in_snippets = not in_snippets
    if in_snippets: continue
    # sized images: "[*** [https://...]]" -> "![](https://...)"
    for j in range(6):
        stars = '*' * (6 - j)
        t = f"[{stars} ["
        has_t = t in lines[i]
        lines[i] = lines[i].replace(t, "![](")
        if has_t:
            lines[i] = lines[i][:-3] + ')\n'  # drop the trailing "]]\n"
    line = lines[i]
    # wrap in list() so the truthiness tests below mean "at least one match"
    head_match = list(head_pattern.finditer(line))
    math_match = list(math_pattern.finditer(line))
    links_match = list(link_pattern.finditer(line))
    tags_match = list(tag_pattern.finditer(line))
    url_links_match = list(url_link_pattern.finditer(line))
    if head_match:
        targets = set()
        for _head in head_match:
            targets.add((_head.group(0), _head.group(1)))
        for whole, text in targets:
            lines[i] = lines[i].replace(whole, f"### {text}")
    if math_match:
        targets = set()
        for _math in math_match:
            targets.add(_math.group(1))
        for link in targets:
            update_link = link.replace("_", r"\_")  # keep _ from becoming markdown emphasis
            lines[i] = lines[i].replace(f"{link}]", f"${update_link}$").replace("[$ ", "")
    if url_links_match:
        targets = set()
        for url_link in url_links_match:
            title = url_link.group(1)
            url = url_link.group(2)
            targets.add((title, url))
        for title, url in targets:
            lines[i] = lines[i].replace(f"[{title} {url}]", f"[{title}]({url})")
    if links_match:
        targets = set()
        for _link in links_match:
            targets.add(_link.group(1))
        for link in targets:
            md_path = link.replace(' ', '_').replace('?', '').replace('!', '').replace(':', '') + ".md"
            md = '{{< ref "' + md_path + '" >}}'
            if os.path.exists(md_path):
                # the linked page is synced too: point at the Hugo page
                lines[i] = lines[i].replace(f"[{link}]", f"[{link}]({md}/)")
            else:
                # not synced: point back at Scrapbox
                lines[i] = lines[i].replace(f"[{link}]", f"[{link}](https://scrapbox.io/yuwd/{quote(link)})")
    if tags_match and i == 1:  # tags live on the second line of the page
        for _tag in tags_match:
            tag = _tag.group(1)
            tags.append(tag)
            lines[i] = lines[i].replace(f"#{tag}", "")
    for img in img_pattern.finditer(lines[i]):
        url = img.group(1)
        img_url = url
        exts = 'png', 'gif', 'jpeg', 'jpg', 'webp'
        if "https://gyazo.com/" in url:
            # Gyazo page URLs have no extension; probe until one resolves
            for e in exts:
                if url.endswith(e):
                    img_url = url
                    break
                img_url = f"{url}.{e}"
                res = requests.get(img_url)
                if res.status_code == 200:
                    break
        lines[i] = lines[i].replace(f"![]({url})", '{{< ' + f'img src="{img_url}" position="center"' + ' >}}<br>')
    for _math in large_math_pattern.finditer(lines[i]):
        math = _math.group(1)
        lines[i] = f"${math}$\n"  # "$$...$$" renders as display math
meta = """\
---
title: "TITLE"
date: DATE
description:
draft: false
hideToc: false
enableToc: true
enableTocContent: true
tocPosition: inner
tags:
TAGS
series:
-
# image: images/feature3/code-file.png
libraries:
- katex
- mermaid
- msc
---
"""

title = lines[0].replace('\n', '').lstrip('# ')  # drop the "# " sb2md.js adds to the first line
if "paper" in tags:
    title = f"【論文メモ】{title}"  # prefix for paper-memo posts
else:
    tags.append("post")
print("TAG", tags)
date = datetime.datetime.fromtimestamp(int(sys.argv[2])).strftime('%Y-%m-%dT%H:%M:%S+09:00')
meta = meta.replace("TITLE", title)
meta = meta.replace("DATE", date)
meta = meta.replace("TAGS", "\n".join(map(lambda x: f"- {x}", tags)))
# prepend the front matter; drop the original title and tag lines
lines = list(map(lambda x: x + '\n', meta.split('\n'))) + lines[2:]
with open(sys.argv[1], 'w') as f:
    f.writelines(lines)
# for line in lines:
#     print(line, end='')
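Finally, note that scrapbox.py builds the date by formatting a naive fromtimestamp() result and hard-coding the +09:00 suffix, so the front matter is only correct when the script runs on a JST machine. A timezone-explicit variant, if that assumption ever needs to go away (date_demo.py is a sketch, not part of the pipeline):
code:date_demo.py
import datetime

# make the JST assumption explicit instead of relying on the local timezone
JST = datetime.timezone(datetime.timedelta(hours=9))
ts = 1650000000  # an example "updated" value from the API
print(datetime.datetime.fromtimestamp(ts, tz=JST).isoformat())
# -> 2022-04-15T14:20:00+09:00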