RSelenium包抓取链家网（上：模拟点击与页面抓取）

安装RSelenium包

# 直接从CRAN下载RSelenium包
install.packages("RSelenium")

启动Selenium服务器

在控制台输入java -jar D:\R\library\Rwebdriver\selenium-server-standalone-3.7.1.jar以启动Selenium服务器。保持打开状态，可配合plantomjs、Chrome或Firefox等浏览器使用
本次案例选择chrome浏览器自动抓取，需要下载相应的浏览器驱动（参考Selenium环境配置第3部分）

思路

页面准备

library(rvest)
library(stringr)
library(RSelenium)
remDr <- remoteDriver(browserName = "chrome")
# 创建一个remoteDriver对象，这一步指定用chrome打开网页，但网页尚未打开
base1 <- "https://hui.lianjia.com/ershoufang/"
base2 <- c("danshui", "huiyangqu", "nanzhanxincheng")
url   <- paste(base1, base2, "/", sep = "")
# 只有淡水、惠阳区、南站新城3个地区，故选择手动拼接url

Step1：封装函数LinkinfoFunc

LinkinfoFunc <- function(remDr, url) {
  result <- data.frame()
  # 建立空数据框result，后用于盛装数据
  remDr$open()
  # 打开chrome浏览器
  for (i in seq_along(url)) {
    remDr$navigate(url[i])
    #驱动浏览器访问第i个地区的首页
    j = 0
    # j是第i个地区的页面计数器，即第i个地区第j页，j的初始值为0
    while (TRUE) {
    # while循环将遍历第i个地区的所有页数
      j = j + 1
      # 开始页面计数，j=1
      destination <- remDr$getPageSource()[[1]] %>% read_html()
      # 获取当前访问页面的内容
      link        <- destination %>% html_nodes("li.clear div.title a") %>% html_attr("href")
      # 获取当前访问页面中的房屋链接（30条/页）
      pageinfo    <- destination %>% html_nodes("div.house-lst-page-box") %>% 
                     html_attr("page-data") %>% str_extract_all(., ":[\\d]+") %>% 
                     unlist() %>% gsub(":", "", .)
      totalpage   <- pageinfo[1]
      curpage     <- pageinfo[2]
      # 从源代码中，提取当前页&总页数的信息
      data        <- data.frame(link, stringsAsFactors = FALSE)
      # 将获取到的房屋链接整合为数据框data
      result      <- rbind(result, data)
      # 将data写入空数据框result
      if (curpage != totalpage) {
      # 若当前页还不是总页数，即尚未达到最后一页
        cat(sprintf("第【%d】个地区第【%d】页抓取成功", i, j), sep = "\n")
        # 则输出“第i个地区第j页（即浏览器当前停留页面）抓取成功”的提示信息
        remDr$executeScript("arguments[0].click();", 
                            list(remDr$findElement("css", "div.house-lst-page-box a.on+a")))
        # 并模拟点击下一页
      } else {
        cat(sprintf("第【%d】个地区第【%d】页抓取成功", i, j), sep = "\n")
        break
      }
    }
    cat(sprintf("第【%d】个地区抓取成功", i), sep = "\n")
    # 输出“第i个地区抓取成功”的提示信息，返回for循环，继续抓取第i+1个地区
  }
  remDr$close()
  # 所有地区所有页面抓取完毕，跳出for循环，关闭浏览器窗口
  cat("All work is done!", sep = "\n")
  return(result)
  # 返回循环叠加后的最终数据
}

Step2：封装函数HouseinfoFunc

HouseinfoFunc <- function(link) {
  result <- data.frame()
  for (i in seq_along(link)) {
    destianation <- read_html(link[i], encoding = "UTF-8")
    # 获取第i条房屋链接的页面内容
    location     <- destianation %>% html_nodes("a.no_resblock_a") %>% html_text()
    # 小区位置
    unit         <- destianation %>% html_nodes(".price span.unit") %>% html_text()
    totalprice   <- destianation %>% html_nodes(".price span.total:nth-child(1)") %>%
                    html_text() %>% paste(., unit, sep = "")
    # 总售价（万）
    downpayment  <- destianation %>% html_nodes(".taxtext span") %>% html_text() %>% .[1]
    # 首付（万）
    persquare    <- destianation %>% html_nodes("span.unitPriceValue") %>% html_text()
    # 每平米售价（元/平米）
    area         <- destianation %>% html_nodes(".area .mainInfo") %>% html_text()
    # 面积大小（平米）
    title        <- destianation %>% html_nodes(".title h1") %>% html_text()
    # 标题
    subtitle     <- destianation %>% html_nodes(".title div.sub") %>% html_text()
    # 副标题
    room         <- destianation %>% html_nodes(".room .mainInfo") %>% html_text()
    # 户型
    floor        <- destianation %>% html_nodes(".room .subInfo") %>% html_text()
    # 楼层
    data         <- data.frame(location, totalprice, downpayment, persquare, 
                               area, title, subtitle, room, floor)
    result       <- rbind(result, data)
    cat(sprintf("第【%d】条房屋链接抓取成功", i), sep = "\n")
  }
  cat("All work is done!", sep = "\n")
  return(result)
}

执行函数

linkinfo  <- LinkinfoFunc(remDr, url) %>% unlist()
# 执行函数LinkinfoFunc，得到linkinfo（list形式）
houseinfo <- HouseinfoFunc(linkinfo)
# 执行函数HouseinfoFunc，得到houseinfo（data.frame形式）
View(houseinfo)
write.table(houseinfo, row.names = FALSE, sep = ",", "houseinfo.csv")
# View()函数查看数据并导出到本地

查看数据

总结

本例使用chrome浏览器打开网页，因此不需要伪造User-Agent。如使用phantomjs无头浏览器，则要伪造一个User-Agent：

myheader <- list(phantomjs.page.settings.userAgent = 
                 "Mozilla/5.0 (Windows NT 6.1; WOW64; 
                 rv:29.0) Gecko/20120101 Firefox/29.0")
remDr    <- remoteDriver(browserName = "phantomjs", extraCapabilities = myheader)

链家网的数据其实并不需要动用Selenium服务器，所有房屋信息在源代码中都能找到。另外，url的变化也十分有规律，最原始的方法是查看各个地区的页数，然后手动构造所有页数，例如，淡水共46页、惠阳区共4页、南站新城共27页：

base <- "https://hui.lianjia.com/ershoufang/"
url1 <- paste0(base, "danshui/pg", 1:46, "/") 
url2 <- paste0(base, "huiyangqu/pg", 1:4, "/") 
url3 <- paste0(base, "nanzhanxincheng/pg", 1:27, "/")
url  <- c(url1, url2, url3)

这里主要用RSelenium包尝试点击下一页的功能，因为各地区总页数不一定总是46、4、27，如果手动构造，容易漏掉或重复抓取。模拟点击可以通过if函数判断当前页是否达到最后一页，yes则停止，no则继续点击。过程中遇到一些错误，并得到迂回解决：

【错误一】自动点击下一页出现问题，有两种解决办法：
1. 一是直接点击“下一页”这个按钮（如果页面有这个按钮的话），直接找到这个按钮的css或xpath
2. 二是点击“当前页+1”（如当前第1页，则直接点击第2页），相邻兄弟选择器（+）：本例a.on是当前页，+a即下一页
【错误二】点击过程中，页面有无关的导航栏挡住“下一页”按钮，导致无法点击并报错，尝试两种办法：
1. 一是换为phantomjs来驱动模拟点击，而非chrome
2. 二是使用函数executeScript执行一个JavaScript片段arguments[0].click();。该函数会将JavaScript片段视为参数来执行，而JavaScript片段可以返回首个（JavaScript中索引值是以0位基准）webElement，并实现点击。
由于基本没有设置拦截，数据抓取速度约1-2条/秒

参考资料：
RSelenium and Javascript
click on next button :rseleniun
左手用R右手Python系列——动态网页抓取与selenium驱动浏览器

RSelenium包抓取链家网（上：模拟点击与页面抓取）

安装RSelenium包

启动Selenium服务器

思路

总结

浏览过的版块