我正在尝试创建一个算法,以递归和功能的方式进行web爬行。我知道如何使用循环、var变量并在其上进行累积。但我很难递归地去做。
有关我的代码的几个问题:1. 为什么 def loop 的返回类型是 Any?2. 对于形如 http://..../example.zip 的 URL,getLinksPage 会抛出异常、返回 None 并中断整个循环,该如何处理?3. 如何使用 Scala 的测试框架来测试这段代码?
/** Fetches the page at `urlToCrawl` with Jsoup and extracts every
  * same-host link as an absolute URL string.
  *
  * @param urlToCrawl absolute URL of the page to fetch
  * @return Some(links on the same host), or None when the fetch/parse
  *         fails — e.g. the URL points at a non-HTML resource such as a
  *         .zip file.
  */
def getLinksPage(urlToCrawl: String): Option[List[String]] = {
  import scala.util.control.NonFatal
  try {
    val conn = Jsoup.connect(urlToCrawl)
    val doc = conn.get()
    val elements = doc.select("a[href]")
    val elementsSc = elements.asScala
    // "abs:href" resolves relative hrefs against the page's base URL.
    val links = elementsSc.map(_.attr("abs:href")).toSeq
    val linksURL = links.map(new URL(_))
    val targetHost = (new URL(urlToCrawl)).getHost
    // Keep only links that stay on the host being crawled.
    val linksLocalURL = linksURL.filter(_.getHost == targetHost).map(_.toString).toList
    Some(linksLocalURL)
  }
  catch {
    // NonFatal instead of Exception: fatal errors (OutOfMemoryError,
    // InterruptedException, ...) must propagate, not become None.
    case NonFatal(_) => None
  }
}
import scala.annotation.tailrec
/** Tail-recursively crawls the frontier of links, visiting each URL once.
  *
  * Fixes vs. the original:
  *  - returns List[String] instead of Any — every branch now yields the
  *    same type (question 1);
  *  - a None from getLinksPage (e.g. a .zip URL) no longer aborts the
  *    crawl: the failed page simply contributes no new links and the
  *    rest of the frontier is still processed (question 2);
  *  - the remaining frontier `tl` is no longer dropped when descending
  *    into `hd`.
  *
  * @param l   current frontier of links still to visit
  * @param acc URLs already visited (accumulator)
  * @return all URLs reachable from the initial frontier
  */
@tailrec
def loop(l: Option[List[String]], acc: List[String]): List[String] = l match {
  case Some(Nil) | None => acc
  case Some(hd :: tl) =>
    if (!acc.contains(hd))
      // Visit hd: prepend its links to the remaining frontier so tl is
      // preserved; a failed fetch adds nothing but keeps tl alive.
      loop(Some(getLinksPage(hd).getOrElse(Nil) ++ tl), hd :: acc)
    else
      loop(Some(tl), acc)
}
// Start the crawl from mainURL; it is pre-seeded into the accumulator so
// the start page itself is not visited twice.
loop(getLinksPage(mainURL), List(mainURL))
发布于 2017-10-20 17:15:13
一个 for 推导式(for comprehension)应该对此有所帮助。另外,为了简单起见,可以考虑只返回一个 List[String](失败时返回 List[String].empty),而不是返回 Option[List[String]]。还可以把 conn 实例提取到一个特质(trait)中,该特质允许你覆盖这个值,或者把函数改成接受隐式的 conn 参数,这样单元测试就可以对它进行模拟(mock)。编辑:下面是一个粗略的示例,说明如何用 ScalaTest 配合 ScalaMock,把 getLinksPage 和 loop 作为独立单元进行测试。免责声明:语法可能不是 100% 正确,请根据需要自行调整。
// Sketch (from the answer): the Jsoup connection factory is factored out
// into getConnection so a test can override it and inject a mocked
// Connection — no real network access needed.
case class Crawler() {
// Test seam: override in a subclass/anonymous class to return a mock.
def getConnection(url: String) = Jsoup.connect(url)
def getLinksPage(urlToCrawl: String): Option[List[String]] = {
val conn = getConnection(urlToCrawl)
// "..." = rest of the original getLinksPage body, elided in the answer.
...
}
}
/** ScalaTest + ScalaMock sketch testing Crawler.getLinksPage and
  * Crawler.loop in isolation.
  * NOTE(review): "CrawerSpec" looks like a typo for "CrawlerSpec" — name
  * kept unchanged to avoid breaking any references.
  */
class CrawerSpec extends WordSpec with Matchers with MockFactory {
  // Fixture for getLinksPage: replaces the real Jsoup connection factory
  // with a mocked function so no network access happens.
  trait LinksFixture {
    val connection = mock[Connection]
    val getConnection = mockFunction[String, Connection]
    lazy val crawler = new Crawler() {
      override def getConnection(url: String) = LinksFixture.this.getConnection(url)
    }
  }
  // Fixture for loop: stubs out getLinksPage entirely.
  trait LoopFixture {
    // Fixed: the original used mock[String, Option[List[String]]] — but
    // mock[] takes a single type to mock; a function stub is created with
    // mockFunction (consistent with getConnection above).
    val getLinksPage = mockFunction[String, Option[List[String]]]
    lazy val crawler = new Crawler() {
      override def getLinksPage(url: String) = LoopFixture.this.getLinksPage(url)
    }
  }
  "getLinksPage" should {
    "return the links" in new LinksFixture {
      val url = "http://bad-wolf"
      getConnection expects (url) returning connection
      // add other expects on connection
      crawler.getLinksPage(url) shouldBe expected // define expected
    }
  }
  "loop" should {
    "loop over the links" in new LoopFixture {
      // Simulated link graph: a -> {b, c}, b -> {d}, everything else fails.
      getLinksPage expects (*) onCall {
        _ match {
          case "a" => Some(List("b", "c"))
          case "b" => Some(List("d"))
          case _   => None
        }
      }
      // add any other expects
      crawler.loop(Some(List("a")), List.empty[String]) shouldBe // define expected
    }
  }
}
https://stackoverflow.com/questions/46852052
复制相似问题