shell脚本--练习1(爬虫)

系统环境 

[root@m01 scripts]# uname -r
2.6.32-696.el6.x86_64
[root@m01 scripts]# uname -m
x86_64
[root@m01 scripts]# cat /etc/redhat-release 
CentOS release 6.9 (Final)

 shell练习1

#!/bin/bash
# date: 2018-03-xx
# author: yk
# description: Crawl a 51cto blogger's article list (time, title, link)
# version: 0.1

source /etc/profile
. /etc/init.d/functions

# Scratch file for the page currently being processed. mktemp picks an
# unpredictable name, so the path cannot be guessed and pre-created in /tmp.
TmpFile=$(mktemp /tmp/.51cto_blog.XXXXXX) || {
	echo "cannot create temporary file" >&2
	exit 1
}
# Remove the scratch file on every exit path (including early failures)
trap 'rm -f -- "$TmpFile"' EXIT

# Result file; it keeps the timestamped name because it is the script's output
BlogFile="/tmp/$(date +%Y%m%d_%H%M%S)_blog.html"
touch "$BlogFile"

# Let the user enter the 51cto blogger's homepage URL
# (-r keeps backslashes in the input literal)
read -r -p 'please input website: ' Website

# Download the blogger's homepage. "--" stops option parsing so that input
# starting with "-" is treated as a URL, never as a wget option.
if ! wget -q -O "$TmpFile" -- "$Website" &>/dev/null
then
	echo "you input website is not exist" >&2
	# Nothing was collected, so do not leave an empty result file behind
	rm -f -- "$TmpFile" "$BlogFile"
	exit 1
fi

# The "last page" (末页) link of the homepage ends in p<number>; that number is
# the total count of list pages. Only the first match is kept so that MainURL
# always holds a single URL.
MainURL=$(sed -n '/class="last".*末页.*/p' "$TmpFile" | grep -Eo 'http:.*p[0-9]{1,}' | head -n 1)

# Everything after the final "p" is the page count (e.g. ".../p28" -> 28)
Pages=${MainURL##*p}

# If the input was not a blogger homepage, no such link exists and Pages is
# empty (not a number). The regex check avoids evaluating arbitrary text.
if [[ "$Pages" =~ ^[0-9]+$ ]] && (( 10#$Pages > 0 ))
then
	# Force base 10 so a zero-padded count is not read as octal later on
	Pages=$((10#$Pages))
	echo "please wait ......"
else
	echo "you input url is not homepage" >&2
	rm -f -- "$TmpFile"
	rm -f -- "$BlogFile"
	exit 1
fi



# Base address of the list pages: MainURL without its trailing page number
UR=$(printf '%s\n' "$MainURL" | sed -rn 's#[0-9]{1,}$##gp')

# Traverse every list page, 1..Pages
for ((i=1;i<=Pages;i++))
do
	# Base address + page number is the complete address of list page $i
	if wget -q -O "$TmpFile" -- "${UR}${i}" &>/dev/null
	then
		# Keep the time / title / link lines and the line following each match,
		# drop the "--" separators grep prints between match groups, and
		# squeeze runs of spaces into a single space
		grep -E -A 1 '<a class="tit" | class="time' "$TmpFile" \
			| sed -r -e '/^--/d' -e 's#[ ]+# #g' >>"$BlogFile"
	else
		# A failed page is reported instead of being skipped silently
		echo "download of page $i failed, skipped" >&2
	fi
	# Pause 0.05 seconds, not too fast
	sleep 0.05
done

# Empty the scratch file; it is reused for the extracted links
: >"$TmpFile"


# ===============================================================
action "The blogger’s blog information has been downloaded locally" /bin/true
echo "Extracting required data from downloaded data ......"
echo "please wait ....."
# ===============================================================


#######################################
# Convert the raw lines collected from the list pages into HTML links.
# Every blog post occupies 4 consecutive input lines:
#   line 1 - posting time, line 3 - link (href), line 4 - title
#   (line 2 is not used)
# Input:   raw lines on stdin
# Outputs: one '<a href="LINK">TIME---TITLE</a><br/>' line per post on stdout
#######################################
extract_blog_links() {
	local IFS=$' \t\n'
	local -a words
	local line Time Href Title
	local i=0

	# read -r keeps backslashes literal; -a splits on whitespace WITHOUT glob
	# expansion, so a title containing * ? or [ is not replaced by file names.
	while read -r -a words
	do
		# Re-join with single spaces: surrounding whitespace is trimmed and
		# inner whitespace runs are collapsed
		line="${words[*]}"
		((++i))
		case "$i" in
			1)
				# Get blog posting time
				Time=$(sed -r 's#^.*>发布于:(.*)</a>#\1#g' <<<"$line")
				;;
			3)
				# Get href
				Href=$(sed -r 's#^.*href=\"(.*)\">#\1#g' <<<"$line")
				;;
			4)
				# Get blog title; the 4th line completes one post, so emit it
				# and start counting again
				Title=$(sed -r 's#^(.*)<.*$#\1#g' <<<"$line")
				printf '%s\n' "<a href=\"$Href\">$Time---$Title</a><br/>"
				i=0
				;;
		esac
	done
}

# Extract from the downloaded data, append the links to the scratch file
extract_blog_links <"$BlogFile" >>"$TmpFile"
# Replace the raw page data in the result file with the finished links.
# With ">" as field separator, field 2 begins with the posting time, so a
# reverse sort on it lists the posts from latest to earliest time string.
sort -rt '>' -k2 "$TmpFile" >"$BlogFile"
rm -f -- "$TmpFile"

# The result file name contains a timestamp, so tell the user where it is
echo "result saved to $BlogFile"
action "success" /bin/true

注:仅供参考

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

编辑于

我来说两句

0 条评论
登录 后参与评论

相关文章

来自专栏技术总结

献给移动端的服务器搭建

application.properties这个是项目的一些配置,举例一下默认是8080端口,我们如果想改下端口的话,就可以在配置增加

2262
来自专栏好好学java的技术栈

SpringMVC+RestFul详细示例实战教程(实现跨域访问)

**REST(Representational State Transfer)**,中文翻译叫“表述性状态转移”。是 Roy Thomas Fielding 在...

2044
来自专栏一个会写诗的程序员的博客

13.7 SpringBoot集成日志系统logback的几个问题 —— 问题1: Logging system failed to initialize using configuration from

让人感到疑惑的是,SpringBoot居然没有对application.properties配置文件value末端作空格trim处理。

3701
来自专栏菩提树下的杨过

weblogic 10.x 上开发restful服务

之前已经学习过 利用JAX-RS快速开发RESTful 服务,当时是jboss环境,如果原封不动的迁移到weblogic 10.x 版本,会杯具的发现应用启动失...

22510
来自专栏Netkiller

怎样制作RPM包

怎样制作RPM包 摘要 我在网上找RPM包的制作例子几乎都是C源码编译安装然后生成RPM包, 而我的程序不是C写的很多时候是脚本语言如Python, PHP 甚...

6566
来自专栏软件开发

Spring MVC 学习总结(十一)——IDEA+Maven+多模块实现SSM框架集成

与SSH(Struts/Spring/Hibernate/)一样,Spring+SpringMVC+MyBatis也有一个简称SSM,Spring实现业务对象管...

2352
来自专栏Objective-C

iOS-安装和使用 CocoaPods

4367
来自专栏用户2442861的专栏

cmake教程4(find_package使用)

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/haluoluo211/article/d...

3073
来自专栏菩提树下的杨过

mac:在当前文件夹打开terminal终端

System Preferences -> Keyboard -> Shortcuts -> Services -> New Terminal at Folde...

2268
来自专栏一个会写诗的程序员的博客

Spring Boot 集成 WebFlux 开发 Reactive Web 应用

IBM的研究称,整个人类文明所获得的全部数据中,有90%是过去两年内产生的。在此背景下,包括NoSQL,Hadoop, Spark, Storm, Kylin在...

1352

扫码关注云+社区

领取腾讯云代金券