スクレイピング
code:R
# パッケージの読み込み
install.packages(c("rvest", "dplyr", "purrr", "stringr", "magrittr", "tidyr", "lubridate")) # ← 追加で tidyr, lubridate も必要
library(rvest)
library(dplyr)
library(purrr)
library(stringr)
library(magrittr)
library(tidyr)
library(lubridate)
# Base search-result URL. The original author's note labels this search as
# Kawasaki City (Kawasaki-ku). The query string is split into pieces only for
# readability; the concatenated value is a single URL.
base_url <- paste0(
  "https://suumo.jp/jj/chintai/ichiran/FR301FC001/",
  "?ar=030&bs=040&ta=14&sc=14131",
  "&cb=0.0&ct=9999999&et=15",
  "&md=01&md=02&md=03",
  "&cn=9999999&mb=0&mt=9999999",
  "&shkr1=03&shkr2=03&shkr3=03&shkr4=03",
  "&fw2=&srch_navi=1"
)
# Scrape a single search-result page.
#
# Args:
#   url: URL of one result page.
#
# Returns:
#   A tibble with one row per `.cassetteitem` node and the character columns
#   name, address, access, age, rent, management_fee, deposit, key_money,
#   layout and area (raw text, not yet parsed). An empty tibble is returned,
#   with a warning, when the page cannot be read or contains no such node.
scrape_suumo_page <- function(url) {
  message("アクセス中: ", url)

  # `return()` inside a tryCatch handler only leaves the handler itself, so
  # a failed download is signalled with NULL and the early exit from this
  # function happens below, outside the handler.
  page <- tryCatch(
    read_html(url),
    error = function(e) {
      warning("読み込み失敗: ", url)
      NULL
    }
  )
  if (is.null(page)) {
    return(tibble())
  }

  listings <- html_elements(page, ".cassetteitem")
  if (length(listings) == 0) {
    warning("物件が見つかりませんでした: ", url)
    return(tibble())
  }

  map_dfr(listings, function(listing) {
    # Trimmed text of the first node matching `selector` inside this card.
    # Only the first match is used, so a card holding several matching
    # nodes contributes just one value per column.
    field <- function(selector) {
      html_text(html_element(listing, selector), trim = TRUE)
    }

    tibble(
      name = field(".cassetteitem_content-title"),
      address = field(".cassetteitem_detail-col1"),
      access = field(".cassetteitem_detail-col2"),
      age = field(".cassetteitem_detail-col3"),
      rent = field(".cassetteitem_price--rent"),
      management_fee = field(".cassetteitem_price--administration"),
      deposit = field(".cassetteitem_price--deposit"),
      key_money = field(".cassetteitem_price--gratuity"),
      layout = field(".cassetteitem_madori"),
      area = field(".cassetteitem_menseki")
    )
  })
}
# Find the highest page number shown in the pagination links of a page.
#
# Args:
#   url: URL of a search-result page.
#
# Returns:
#   The largest purely numeric link label under `.pagination-parts a`, as an
#   integer. Returns 1L when no numeric pagination link is present, so the
#   caller can always build at least one page URL.
get_max_page <- function(url) {
  page <- read_html(url)

  labels <- page %>%
    html_elements(".pagination-parts a") %>%
    html_text(trim = TRUE)

  # Keep only all-digit labels before converting; this avoids
  # "NAs introduced by coercion" warnings from non-numeric link text.
  page_numbers <- as.integer(labels[grepl("^[0-9]+$", labels)])
  page_numbers <- page_numbers[!is.na(page_numbers)]

  # max() over an empty vector would yield -Inf (with a warning).
  if (length(page_numbers) == 0) {
    return(1L)
  }
  max(page_numbers)
}
# Determine how many result pages exist and build one URL per page.
max_page <- get_max_page(base_url)

# Fall back to a single page when the page count is unusable (missing,
# non-finite or below 1); `1:max_page` would otherwise count backwards or
# fail outright.
if (length(max_page) != 1 || !is.finite(max_page) || max_page < 1) {
  max_page <- 1L
}
page_urls <- paste0(base_url, "&page=", seq_len(max_page))

# Scrape every page and row-bind the results into one tibble.
suumo_data <- map_dfr(page_urls, scrape_suumo_page)
# ---------- Data cleaning starts here ----------
# Turns the scraped text columns into numeric features. Values are pulled
# out with regex capture groups so that non-matching text becomes NA (or 0
# for the management fee) without "NAs introduced by coercion" warnings.
suumo_data <- suumo_data %>%
  mutate(
    # Rent such as "6.8万円" -> 68000; anything else (e.g. "-") -> NA.
    rent_value = as.numeric(
      str_match(rent, "^(\\d+(?:\\.\\d+)?)万円$")[, 2]
    ) * 10000,
    # Management fee such as "5000円" -> 5000. "-", blank and missing
    # values are all assumed to mean 0 yen.
    management_fee_value = coalesce(
      as.numeric(str_match(management_fee, "^(\\d+)円$")[, 2]),
      0
    ),
    # Rent plus management fee.
    total_cost = rent_value + management_fee_value,
    # Floor area: first number found in the text (e.g. "25.3m2" -> 25.3).
    area = as.numeric(str_extract(area, "\\d+(\\.\\d+)?")),
    # Walking time: collect every "歩N分" in the access text and keep the
    # smallest N; NA when none is present.
    walk_from_station = map_int(
      str_extract_all(access, "歩\\d+分"),
      function(walks) {
        mins <- as.integer(str_extract(walks, "\\d+"))
        mins <- mins[!is.na(mins)]
        if (length(mins) == 0) {
          return(NA_integer_)
        }
        min(mins)
      }
    ),
    # Building age: "新築" -> 0, "築10年" -> 10, otherwise NA. The digits
    # are taken from inside "築…年" so other numbers in the same text
    # cannot be picked up by mistake.
    age_of_building = case_when(
      str_detect(age, "新築") ~ 0L,
      TRUE ~ as.integer(str_match(age, "築(\\d+)年")[, 2])
    )
  ) %>%
  # Split the access text into line and station, assuming the
  # "<line>「<station>」..." layout described by the original author.
  # Extra pieces are dropped silently (extra = "drop") instead of with a
  # warning; missing pieces are filled with NA.
  separate(
    access,
    sep = "「", into = c("line", "station"),
    fill = "right", extra = "drop"
  ) %>%
  separate(
    station,
    sep = "」", into = c("station", "drop"),
    fill = "right", extra = "drop"
  ) %>%
  # Remove the temporary column and the raw text columns that are no
  # longer needed.
  select(-c(drop, rent, management_fee, deposit, key_money))
# ---------- Data cleaning ends here ----------
# Quick look at the result: first five rows, then the row count.
suumo_data %>%
  head(5) %>%
  print()
print(nrow(suumo_data))

# Write the cleaned table to CSV and report where it went.
write.csv(
  suumo_data,
  file = "suumo_kawasaki003.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
cat("✅ suumo_kawasaki003.csv を保存しました!左のFilesタブからExport→Downloadできます。\n")
# Scatter plots of total cost against each explanatory variable.
plot(suumo_data$area, suumo_data$total_cost)
plot(suumo_data$walk_from_station, suumo_data$total_cost)
plot(suumo_data$age_of_building, suumo_data$total_cost)

# Total cost vs. floor area with a fitted regression line.
# The model is fitted with `data =` rather than `suumo_data$...` inside the
# formula, so coefficients are named after the columns and predict() can be
# used with new data.
plot(suumo_data$area, suumo_data$total_cost)
m <- lm(total_cost ~ area, data = suumo_data)
abline(m)
https://gyazo.com/b6afe370637bdfa964a17758510cc6e2
# Total cost vs. walking minutes with a fitted regression line.
# `data =` keeps the coefficient named after the column and lets predict()
# accept new data.
plot(suumo_data$walk_from_station, suumo_data$total_cost)
n <- lm(total_cost ~ walk_from_station, data = suumo_data)
abline(n)
https://gyazo.com/1c97ae39bc6aa3382b6bce7b60e6ab3d
# Total cost vs. building age with a fitted regression line.
# `data =` keeps the coefficient named after the column and lets predict()
# accept new data.
plot(suumo_data$age_of_building, suumo_data$total_cost)
l <- lm(total_cost ~ age_of_building, data = suumo_data)
abline(l)
https://gyazo.com/b45ccfa4e05e7f30a9118fe73bd1ec3f
# Multiple regression: total cost explained by floor area, walking minutes
# and building age together.
model <- lm(
  total_cost ~ area + walk_from_station + age_of_building,
  data = suumo_data
)
summary(model)
https://gyazo.com/b4b289c3dabb5203e40355f75fbc5a65
上の重回帰モデルの係数を使うと、面積25m^2・駅から徒歩10分・築年数10年の物件の予測額(家賃+管理費)は
39175.82+1970.60*25+531.09*10-571.42*10
88037.52円
ここよりも強気に設定するか、弱気に設定するか。