このマシュマロの質問解答を集めたScrapboxはどうやって作ったのですか？

質問

解答

恥ずかしながら、マシュマロの質問に対する解答を、まったくローカルに残していなかったので、データを拾い集めるところからはじめました。

まず質問の方は、マシュマロの確認済みのページに残っています。

かなりの数だったので手作業コピペはあきらめ、Python＋Seleniumの自動巡回で吸い出してもらいました。

code:get_questions_from_marshmallow.py

from selenium import webdriver

browser = webdriver.Chrome() # browser to drive; Chrome is specified here

def get_questions_from_marshmallow():
    """Collect the question texts listed on Marshmallow's "answered" pages.

    Drives the module-level ``browser`` (a Selenium WebDriver) through the
    paginated answered-messages list, starting at page 1, and stops at the
    first page that shows the "no messages" notice or when loading the next
    page fails.

    Returns:
        list[str]: question texts in the order they were encountered.
    """
    root_url = 'https://marshmallow-qa.com/messages/answered?page=%s'  # answered marshmallows
    counter = 1
    questions = []

    # Open the first answered page.
    browser.get(root_url % str(counter))

    while True:
        # An empty page means we have walked past the last page.
        # find_element(by, value) with a string locator is used instead of the
        # find_element_by_* helpers, which newer Selenium releases removed.
        page_text = browser.find_element('css selector', 'body > main > div').text
        if '表示できるメッセージがないようです' in page_text:
            break

        # Extract the questions on this page: probe list items 2..32 and keep
        # the link text of every item that exists.
        root_now_xpath = '/html/body/main/div/ul/li[%s]/div/div/div[2]/a'
        for i in range(2, 33):
            now_item = browser.find_elements('xpath', root_now_xpath % str(i))
            if now_item:
                questions.append(now_item[0].text)

        try:
            # Move on to the next page.
            counter += 1
            browser.get(root_url % str(counter))
        except Exception:
            # Best effort: if the next page cannot be loaded, stop and return
            # what was gathered so far. Catching Exception (not a bare except)
            # keeps Ctrl-C working.
            print("NoSuchElementException")
            break

    return questions

解答の方はTwitter上に流されているので、まずユーザー情報のページに有る「全ツイートの履歴をリクエスト」機能をつかって全ツイートを含むcsvファイルを取得しました。

全ツイートを含むcsvファイルから、マシュマロへの解答ツイートだけを抽出しました。

マシュマロへの解答は１つのツイートだけのものもあれば、複数のツイートでできたスレッドによるものもあります。

スレッドになっている解答をあつめるために、解答ツイートのtweet_idをin_reply_to_status_idに含むツイートを集め、さらにそのtweet_idをin_reply_to_status_idに含むツイートを集め…を繰り返して、スレッド解答をまとめました。

code:get_answers_from_tweet_csv.py

import pandas as pd

# Helpers for following reply threads.
def get_text(now_id):
    """Return the text of the tweet whose ``tweet_id`` equals ``now_id``.

    Looks the tweet up in the module-level ``no_retweets`` DataFrame, which
    must have ``tweet_id`` and ``text`` columns. If several rows match, the
    first one is returned.

    Raises:
        IndexError: if no row has the given ``tweet_id``.
    """
    return no_retweets[no_retweets['tweet_id'] == now_id].text.values[0]

def thread(now_id):
    """Return the text of tweet ``now_id`` followed by its reply chain.

    Starting from ``now_id``, repeatedly looks in the module-level
    ``no_retweets`` DataFrame for a tweet whose ``in_reply_to_status_id``
    equals the current id, appends that tweet's text on a new line, and
    continues from it until no further reply is found. When several tweets
    reply to the same id, only the first match is followed.

    Returns:
        str: the texts of the chain joined with ``'\\n'``.
    """
    # Start with the text of the top tweet.
    text_data = get_text(now_id)

    # Look for a tweet that replies to the current id.
    has_in_reply_to_status_id = no_retweets[no_retweets['in_reply_to_status_id'] == now_id]

    # Keep following replies for as long as one is found.
    while len(has_in_reply_to_status_id) > 0:
        now_id = has_in_reply_to_status_id.tweet_id.values[0]
        text_data += '\n' + get_text(now_id)
        has_in_reply_to_status_id = no_retweets[no_retweets['in_reply_to_status_id'] == now_id]

    return text_data

def get_answer_from_tweet_csv():
    """Build ``[tweet_id, answer_text]`` pairs for every Marshmallow answer.

    Reads ``tweets.csv`` from the current directory, drops retweets, selects
    the tweets that start a Marshmallow answer, and expands each one into its
    full reply thread with ``thread()``.

    Side effect: rebinds the module-level ``no_retweets`` DataFrame.

    Returns:
        list[list]: one ``[tweet_id, thread_text]`` pair per answer.
    """
    # get_text() and thread() read `no_retweets` as a module-level name, so it
    # has to be published globally; as a plain local they raise NameError.
    global no_retweets

    # Load the CSV file containing all tweets.
    tweets_df = pd.read_csv('tweets.csv')

    # Drop retweets.
    no_retweets = tweets_df[tweets_df['retweeted_status_id'].isnull()]

    # Keep only tweets that carry expanded_urls.
    no_retweets_with_expanded_urls = no_retweets[no_retweets['expanded_urls'].notnull()]

    # Keep the tweets whose `source` column mentions marshmallow-qa.com; these
    # are treated as the first tweet of each answer.
    # NOTE(review): the original comment described this as a filter on
    # expanded_urls, but the query tests `source` -- confirm which is intended.
    marshmallow_tops = no_retweets_with_expanded_urls.query(
        'source.str.contains("https://marshmallow-qa.com")', engine='python')

    # Collect each answer (top tweet plus its thread) into a list.
    answers = [[now_tweet_id, thread(now_tweet_id)]
               for now_tweet_id in marshmallow_tops['tweet_id']]

    return answers

こうしてできた質問データと解答データを突き合わせて（手作業が少し必要だったのでExcel上でやりました）、

最後にjsonで書き出したものをScrapboxにインポートしました。

code:marshmallow_to_scrapbox.py

import pandas as pd

import json

import re

# Helper that converts one question/answer pair into a dict for the JSON export.
def qa2dict(now_qa):
    """Convert one ``[question, answer]`` pair into a Scrapbox page dict.

    Args:
        now_qa: sequence whose first item is the question text and whose
            second item is the answer text (both ``str``).

    Returns:
        dict: ``{'title': ..., 'lines': [...]}`` where the title is the first
        30 characters of the question's first line, and ``lines`` holds a
        '質問' section, a '解答' section and, when the answer cites any
        『...』 titles, a '文献' section listing them as bracketed links.
    """
    now_question = now_qa[0]
    now_answer = now_qa[1]

    # Treat everything enclosed in 『』 as a book title and wrap each one in
    # [ ] so that it becomes a link/tag.
    book_list = re.findall(r'『[^』]+』', now_answer)
    book_list = ['[' + book + ']' for book in book_list]

    now_question_list = now_question.split('\n')
    now_answer_list = now_answer.split('\n')

    qa_dict = {}
    qa_dict['title'] = now_question_list[0][:30]  # first line of the question becomes the title
    qa_dict['lines'] = ['', '質問'] + now_question_list
    qa_dict['lines'] += ['', '解答'] + now_answer_list
    if book_list:
        qa_dict['lines'] += ['', '文献'] + book_list

    return qa_dict

# Load the Excel workbook that holds every Marshmallow question/answer pair.
mash_df = pd.read_excel('マシュマロ全質問回答.xlsx')

# Convert to a list of rows: [[question, answer], [question, answer], ...]
mash_list = mash_df.values.tolist()

# Convert each question/answer pair into a dict for the JSON export.
dict_list = [qa2dict(mash_qa) for mash_qa in mash_list]

# Build the top-level dict for the output JSON.
pages_dict = dict(pages=dict_list)

# Write the JSON file. ensure_ascii=False emits the Japanese text as-is, so
# the encoding is pinned to UTF-8 instead of relying on the platform default,
# and the with-block guarantees the file is flushed and closed.
with open('mash.json', 'w', encoding='utf-8') as f2:
    json.dump(pages_dict, f2, ensure_ascii=False, indent=4, sort_keys=False)

以上となります。