User behavior logs:
Why record user access behavior logs:
Channels through which user behavior logs are generated:
Roughly what user behavior logs contain:
The value of analyzing user behavior logs:
Offline data processing workflow:
Workflow diagram:
Requirement: count the number of accesses made by each browser, based on the server access logs.
A log snippet is shown below:
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getadv HTTP/1.1" 200 813 "www.xxx.com" "-" cid=0&timestamp=1478707261865&uid=2871142&marking=androidbanner&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=f51e97d1cb1a9caac669ea8acc162b96 "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.134.244:80 200 0.027 0.027
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
First, we need to extract the browser information from each log line so that the statistics can be computed per browser. We could implement this ourselves, but there is no need to reinvent the wheel: I found a small tool on GitHub that does exactly this (the UserAgentParser project, whose Maven coordinates appear in the pom below).
After fetching it with git clone (or downloading it in a browser), open a command line in the project's root directory, then use Maven to package it and install it into the local repository:
$ mvn clean package -DskipTests
$ mvn clean install -DskipTests
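If you want to confirm the install step worked, the artifact should now sit in the local Maven repository (assuming the default ~/.m2 location; the exact file listing may differ):

$ ls ~/.m2/repository/com/kumkee/UserAgentParser/0.0.1/
UserAgentParser-0.0.1.jar  UserAgentParser-0.0.1.pom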
Once it is installed, add the dependency and the assembly plugin to the project:
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        <releases>
            <enabled>true</enabled>
        </releases>
        <snapshots>
            <enabled>false</enabled>
        </snapshots>
    </repository>
</repositories>

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
        <scope>provided</scope>
    </dependency>
    <!-- dependency for UserAgent parsing -->
    <dependency>
        <groupId>com.kumkee</groupId>
        <artifactId>UserAgentParser</artifactId>
        <version>0.0.1</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.10</version>
        <scope>test</scope>
    </dependency>
</dependencies>

<!-- mvn assembly:assembly -->
<build>
    <plugins>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <archive>
                    <manifest>
                        <!-- left empty here; the main class is passed on the hadoop jar command line instead -->
                        <mainClass></mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
        </plugin>
    </plugins>
</build>
Then we write a test case to try out this parser class. I had not used this tool before, and it is a good habit to test an unfamiliar tool before using it in a project:
package org.zero01.project;

import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;

/**
 * @program: hadoop-train
 * @description: UserAgent parsing test class
 * @author: 01
 * @create: 2018-04-01 22:43
 **/
public class UserAgentTest {

    public static void main(String[] args) {
        String source = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
        UserAgentParser userAgentParser = new UserAgentParser();
        UserAgent agent = userAgentParser.parse(source);

        String browser = agent.getBrowser();
        String engine = agent.getEngine();
        String engineVersion = agent.getEngineVersion();
        String os = agent.getOs();
        String platform = agent.getPlatform();
        boolean isMobile = agent.isMobile();

        System.out.println("Browser: " + browser);
        System.out.println("Engine: " + engine);
        System.out.println("Engine version: " + engineVersion);
        System.out.println("OS: " + os);
        System.out.println("Platform: " + platform);
        System.out.println("Is mobile device: " + isMobile);
    }
}
The console output is as follows:
Browser: Chrome
Engine: Webkit
Engine version: 537.36
OS: Windows 7
Platform: Windows
Is mobile device: false
The printed results show that all of the UserAgent fields were extracted correctly, so we can now use this tool in the project.
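As an aside, the pom above already pulls in JUnit, so the same check could be written as a real unit test. Here is a minimal JUnit 4 sketch (my own variant, not from the original write-up), asserting the same values printed above:

import static org.junit.Assert.assertEquals;

import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;
import org.junit.Test;

public class UserAgentParserTest {

    @Test
    public void testParseChromeUserAgent() {
        // same Chrome UserAgent string as in the main-method test above
        String source = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
        UserAgent agent = new UserAgentParser().parse(source);

        // expected values taken from the console output shown above
        assertEquals("Chrome", agent.getBrowser());
        assertEquals("Webkit", agent.getEngine());
        assertEquals("537.36", agent.getEngineVersion());
    }
}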
Create a class and write the following code:
package org.zero01.hadoop.project;

import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @program: hadoop-train
 * @description: Use MapReduce to count browser access occurrences
 * @author: 01
 * @create: 2018-04-02 14:20
 **/
public class LogApp {

    /**
     * Map: read the input file contents
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        LongWritable one = new LongWritable(1);
        private UserAgentParser userAgentParser;

        protected void setup(Context context) throws IOException, InterruptedException {
            userAgentParser = new UserAgentParser();
        }

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // each received line of log data
            String line = value.toString();
            // the UserAgent field starts right after the 7th double quote in the line
            String source = line.substring(getCharacterPosition(line, "\"", 7) + 1);
            UserAgent agent = userAgentParser.parse(source);
            String browser = agent.getBrowser();
            // emit the map result through the context
            context.write(new Text(browser), one);
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            userAgentParser = null;
        }
    }

    /**
     * Reduce: aggregation
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                // sum up the occurrences of this key
                sum += value.get();
            }
            // write out the final counts
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Return the index of the n-th occurrence of the given marker string
     *
     * @param value
     * @param operator
     * @param index
     * @return
     */
    private static int getCharacterPosition(String value, String operator, int index) {
        Matcher slashMatcher = Pattern.compile(operator).matcher(value);
        int mIndex = 0;
        while (slashMatcher.find()) {
            mIndex++;
            if (mIndex == index) {
                break;
            }
        }
        return slashMatcher.start();
    }

    /**
     * Driver: encapsulates all the information of the MapReduce job
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();

        // clean up the output directory if it already exists
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("output path already existed, it has been deleted");
        }

        // create the Job and set its name via the parameter
        Job job = Job.getInstance(configuration, "LogApp");

        // set the job's main class
        job.setJarByClass(LogApp.class);

        // set the input path the job processes
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // set map parameters
        job.setMapperClass(LogApp.MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // set reduce parameters
        job.setReducerClass(LogApp.MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // set the output path for the job's results
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
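To make the quote-counting logic in getCharacterPosition concrete, here is a minimal standalone sketch (the class name QuoteIndexDemo is mine, and the log line is shortened from the sample above) showing that the 7th double quote marks the start of the UserAgent field:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Quotes 1-2 wrap the request, 3-4 the host, 5-6 the "-" placeholder,
// so the 7th quote opens the UserAgent field that the Mapper extracts.
public class QuoteIndexDemo {

    public static void main(String[] args) {
        // shortened version of the first sample log line above
        String line = "183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "
                + "\"POST /api3/getadv HTTP/1.1\" 200 813 \"www.xxx.com\" \"-\" cid=0&timestamp=1478707261865 "
                + "\"mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G\"";

        Matcher matcher = Pattern.compile("\"").matcher(line);
        int count = 0;
        while (matcher.find()) {
            if (++count == 7) {
                break;
            }
        }

        // everything after the 7th quote starts with the UserAgent string
        System.out.println(line.substring(matcher.start() + 1));
        // prints: mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G"
    }
}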
Open a console in the project directory and run the following command to package the project:
mvn assembly:assembly
Packaging succeeded:
Upload the jar to the server:
[root@localhost ~]# rz    # I am using Xshell, so the rz command can upload the file directly
[root@localhost ~]# ls | grep hadoop-train-1.0-jar-with-dependencies.jar    # check that the upload succeeded
hadoop-train-1.0-jar-with-dependencies.jar
[root@localhost ~]#
Upload the prepared log file to HDFS:
[root@localhost ~]# hdfs dfs -put ./10000_access.log /
[root@localhost ~]# hdfs dfs -ls /10000_access.log
-rw-r--r--   1 root supergroup    2769741 2018-04-02 22:33 /10000_access.log
[root@localhost ~]#
Run the following command (the argument after the class name is the HDFS input path, followed by the output path):
[root@localhost ~]# hadoop jar ./hadoop-train-1.0-jar-with-dependencies.jar org.zero01.hadoop.project.LogApp /10000_access.log /browserout
Execution succeeded:
View the results:
[root@localhost ~]# hdfs dfs -ls /browserout
Found 2 items
-rw-r--r--   1 root supergroup          0 2018-04-02 22:42 /browserout/_SUCCESS
-rw-r--r--   1 root supergroup         56 2018-04-02 22:42 /browserout/part-r-00000
[root@localhost ~]# hdfs dfs -text /browserout/part-r-00000
Chrome  2775
Firefox 327
MSIE    78
Safari  115
Unknown 6705
[root@localhost ~]#
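Since part-r-00000 is sorted by browser name rather than by hit count, a small shell pipeline (a convenience on top of the job's output, just reordering the numbers above) ranks the browsers:

[root@localhost ~]# hdfs dfs -text /browserout/part-r-00000 | sort -k2 -nr
Unknown 6705
Chrome  2775
Firefox 327
Safari  115
MSIE    78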