// scrapCrawler.ts
// (The line "code:scrapCrawler.ts" below this header in the Scrapbox export
// was the code-block marker, not source; kept here as a comment.)
// code:scrapCrawler.ts
import * as pagelist from "../allPagesList.ts/allPagesList.ts"
/**
 * Minimal shape of a Scrapbox page API response as consumed by this crawler.
 * Only the fields read in this file are modelled.
 */
interface ScrapboxPage {
// Canonical page title returned by the API; pushed onto `marked`.
title: string;
// Outgoing links on the page (declared but never read in this file).
links: string[];
// 1-hop related pages; their titles are enqueued for crawling.
relatedPages: { links1hop: Array<{ title: string }> };
}
// Shared progress-spinner singleton used throughout the crawl.
// NOTE(review): the imports for `Spinner` (and `Command` below) are not
// visible in this file — confirm they are imported elsewhere.
const spinner = Spinner.getInstance();
/**
 * CLI entry point: crawls a Scrapbox project starting from the given
 * --pageName page(s), following 1-hop related-page links breadth-first in
 * batches of --numberOfParallelConnections, then writes the crawl result
 * (marked / garbagelist / alllist / excluded) as JSON to --output.
 */
new Command()
  .name("scrapCrawler")
  .version("2024/11/09")
  .description("Command line arguments parser")
  .option(
    "--numberOfParallelConnections -connections <connections:number>",
    "",
    { default: 64 },
  )
  .option("--output <connections:path>", "", { default: "result.json" })
  .option("--exclude <regex:string>", "", { collect: true, default: [] })
  .option("--pageName <name:string>", "multiple", {
    collect: true,
    required: true,
  })
  .option("--projectName <name:string>", "", { required: true })
  .parse(Deno.args)
  .then(({ options }) =>
    pagelist.pageCount(options.projectName).then((pageCount) => ({
      pageCount,
      options,
    }))
  )
  .then(({ pageCount, options }) => {
    const projectName = options.projectName;
    let queue: string[] = options.pageName; // pages still to crawl
    const marked: string[] = []; // pages successfully fetched
    const excluded: string[] = []; // pages skipped via --exclude
    // Compile the exclusion patterns once, not per page (they are constant).
    const excludePatterns = options.exclude.map(
      (pattern: string) => new RegExp(pattern, "i"),
    );

    /**
     * Fetch one page and enqueue its not-yet-seen 1-hop related pages.
     * Resolves without side effects (beyond `excluded`) for excluded pages,
     * and logs-and-skips pages whose fetch fails.
     */
    const getPage = (pageName: string): Promise<void> => {
      if (excludePatterns.some((re) => re.test(pageName))) {
        excluded.push(pageName);
        return Promise.resolve();
      }
      // NOTE(review): assumes `projectName` already carries the full API
      // base URL (e.g. https://scrapbox.io/api/pages/<project>) — confirm.
      return fetch(projectName + "/" + encodeURIComponent(pageName))
        .then((pageRes) => {
          if (pageRes.ok) {
            return pageRes.json();
          }
          return pageRes.text().then((text) => {
            console.error(pageRes.status, text);
            return undefined; // signal "no page" to the next step
          });
        })
        .then((pageJson: Readonly<ScrapboxPage> | undefined) => {
          // BUGFIX: the original fell through with `undefined` after a
          // failed fetch and crashed on `pageJson.title`; skip instead.
          if (pageJson === undefined) return;
          marked.push(pageJson.title);
          for (const related of pageJson.relatedPages.links1hop) {
            if (
              !marked.includes(related.title) &&
              !excluded.includes(related.title)
            ) {
              queue.push(related.title);
            }
          }
        });
    };

    /**
     * Recursively drain the queue, one parallel batch per tick, updating the
     * spinner with progress and an ETA estimate between batches.
     */
    function crawlTick(): Promise<void> {
      // (Was a one-shot `while`; an `if` expresses the intent.)
      if (queue.length < 1) {
        return Promise.resolve();
      }
      const pageNames = queue.splice(0, options.numberOfParallelConnections);
      const elapsedTime = Date.now() - crawlStart;
      // Known total so far: done + still queued (in-flight batch excluded).
      const totalPages = marked.length + queue.length;
      // Guard the divisions: on the first tick `marked` is empty, which
      // previously produced "NaN%" / Infinity ETA in the spinner text.
      const pagesPercent = totalPages > 0
        ? Math.ceil((marked.length / totalPages) * 100)
        : 0;
      const timeRemaining = marked.length > 0
        ? (elapsedTime / marked.length) * queue.length
        : 0;
      const pad = pageCount.toString().length;
      spinner.setText(
        "Crawling...[" +
          marked.length.toString().padStart(pad, "0") +
          "/" +
          totalPages.toString().padStart(pad, "0") +
          " " +
          pagesPercent.toString().padStart(2, "0") +
          "%]ETA" +
          Math.ceil(timeRemaining / 1000 / 60)
            .toString()
            .padStart(2, "0") +
          ":" +
          Math.ceil((timeRemaining / 1000) % 60)
            .toString()
            .padStart(2, "0") +
          "(" +
          pageCount +
          " in project)(" +
          excluded.length.toString().padStart(pad, "0") +
          " excluded)",
      );
      const promiseList: Promise<void>[] = pageNames.map((name) =>
        getPage(name)
      );
      return Promise.all(promiseList).then(() => {
        // Deduplicate the queue and drop anything crawled/excluded meanwhile.
        queue = queue.filter((name, index) =>
          queue.indexOf(name) === index &&
          !marked.includes(name) &&
          !excluded.includes(name)
        );
        return crawlTick();
      });
    }

    spinner.start("Crawling...");
    const crawlStart = Date.now();
    return crawlTick().then(() => ({
      options,
      marked,
      excluded,
    }));
  })
  .then(({ options, marked, excluded }) => {
    spinner.succeed("Crawled");
    spinner.stop();
    return pagelist.allpagelist(options.projectName).then((allpagelist) => ({
      allpagelist,
      options,
      marked,
      excluded,
    }));
  })
  .then(({ allpagelist, options, marked, excluded }) => {
    spinner.start("Saving...");
    // BUGFIX: the original callback used a braced body with no `return`
    // (`{ !marked.includes(value); }`), so it always yielded `undefined`
    // and `garbagelist` was ALWAYS empty. Return the predicate result.
    const garbagelist = allpagelist.filter((value) => !marked.includes(value));
    return Deno.writeTextFile(
      options.output,
      JSON.stringify({
        marked: marked,
        garbagelist: garbagelist,
        alllist: allpagelist,
        excluded: excluded,
      }),
    );
  })
  .then(() => {
    spinner.succeed("Saved");
    spinner.stop();
  });