Hive Summary
Overview
Differences between external and internal tables
Features
Installing Hive
Basic Hive usage
Advanced Hive usage
Ways to save SELECT query results
Using JOIN
Common Hive functions
Hive parameter tuning
Using Hive shell parameters
Installing the Hive web UI
Using Hive with Tez
Connecting to Hive with Beeline
Connecting to Hive with DBeaver 4.0
Common Hive errors
Hive data export and import
hive> create table t_test(id int ,name string, password string) row format delimited fields terminated by ',';
OK
Time taken: 0.196 seconds
hive> insert into t_test(id,name,password) values('1','323','sfdf');
Query ID = root_20160924191209_07686e5a-177e-4194-88a5-58536f15698a
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Job running in-process (local Hadoop)
2016-09-24 19:12:13,483 Stage-1 map = 100%, reduce = 0%
Ended Job = job_local956434617_0002
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to: hdfs://hadoop1:9000/user/hive/warehouse/hivetest.db/t_test/.hive-staging_hive_2016-09-24_19-12-09_727_5241291650450368428-1/-ext-10000
Loading data to table hivetest.t_test
Table hivetest.t_test stats: [numFiles=1, numRows=1, totalSize=11, rawDataSize=10]
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 104 HDFS Write: 186 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 4.957 seconds
hive> select * from t_test;
OK
1	323	sfdf
Time taken: 0.14 seconds, Fetched: 1 row(s)
[root@hadoop1 bin]# hadoop fs -cat /user/hive/warehouse/hivetest.db/t_test/000000_0
1,323,sfdf
Note: by default Hive treats databases and tables as directories on HDFS; only the data itself is stored as files. The default file name is 000000_0; additional files are numbered on from there rather than appended to the first.
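A second non-overwrite insert therefore writes a new file alongside 000000_0 instead of appending to it. A quick sketch to observe this (the exact extra file name, e.g. 000000_0_copy_1, varies by Hive version):
hive> insert into table t_test values (2,'abc','def');
hive> dfs -ls /user/hive/warehouse/hivetest.db/t_test;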
Viewing the databases from the command line:
# hive -help
log4j:WARN No such property [maxFileSize] in org.apache.log4j.DailyRollingFileAppender.
usage: hive
-d,--define <key=value> Variable subsitution to apply to hive
commands. e.g. -d A=B or --define A=B
--database <databasename> Specify the database to use
-e <quoted-query-string> SQL from command line
-f <filename> SQL from files
-H,--help Print help information
--hiveconf <property=value> Use value for given property
--hivevar <key=value> Variable subsitution to apply to hive
commands. e.g. --hivevar A=B
-i <filename> Initialization SQL file
-S,--silent Silent mode in interactive shell
-v,--verbose Verbose mode (echo executed SQL to the
console)
[root@hadoop3 hive]# hive -e "show databases"
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/local/hadoop-2.6.4/share/hadoop/common/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/local/spark-1.3.1-bin-hadoop2.6/lib/spark-assembly-1.3.1-hadoop2.6.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Logging initialized using configuration in jar:file:/usr/local/hive/lib/hive-common-1.2.1.jar!/hive-log4j.properties
OK
default
hivetest
Time taken: 2.623 seconds, Fetched: 2 row(s)
[root@hadoop3 hive]# hive -S -e "show databases"
In silent mode (-S) the MapReduce progress output is not printed.
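Silent mode is useful when capturing clean query results from a script, for example by redirecting to a file (a minimal sketch):
# hive -S -e "select * from t_test" > /tmp/t_test.out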
# vi show-databases.sql
show databases;
# hive -S -f show-databases.sql
log4j:WARN No such property [maxFileSize] in org.apache.log4j.DailyRollingFileAppender.
default
# export TRAINSVAR=trains_var
# hive -S -e "select * from ${TRAINSVAR} limit 10"
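Here the double quotes let the shell, not Hive, expand ${TRAINSVAR}. An equivalent sketch using Hive's own substitution (tbl is a hypothetical variable name; the single quotes keep the shell from touching ${hivevar:tbl}):
# hive -S --hivevar tbl=trains_var -e 'select * from ${hivevar:tbl} limit 10'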
****************************
Prepare the data:
[root@hadoop1 /]# vi /test.bat
1,123,ffgg
2,sfgf,eferg
3,dvf,dvrfg
hive> desc t_test;
OK
id int
name string
password string
Time taken: 0.21 seconds, Fetched: 3 row(s)
[root@hadoop1 /]# hadoop fs -put test.bat /user/hive/warehouse/hivetest.db/t_test
hive> select * from t_test;
OK
1	323	sfdf
1	123	ffgg
2	sfgf	eferg
3	dvf	dvrfg
Time taken: 0.125 seconds, Fetched: 4 row(s)
[root@hadoop1 /]# hadoop fs -cat /user/hive/warehouse/hivetest.db/t_test/test.bat
1,123,ffgg
2,sfgf,eferg
3,dvf,dvrfg
Note: the uploaded file may have any suffix. If a field's value does not match the column's type it is read as NULL; extra fields are silently dropped, and missing fields are padded with NULL.
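A minimal sketch of this NULL behavior, using a hypothetical file bad.bat whose single row has an unparseable id and no third field (so both id and password read back as NULL):
# echo 'abc,onlyname' > bad.bat
# hadoop fs -put bad.bat /user/hive/warehouse/hivetest.db/t_test
hive> select * from t_test where id is null;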
[root@hadoop1 /]# vi t_test.bat
4,dsfef,fgrg
8,drfg,fgrg
9,dgr,rgr
10,dgfrg,rgr
11,dgr,rgrgr
12,dsfe,sfe
[root@hadoop1 testData]# ls
test.bat t_test.bat
hive> load data local inpath '/usr/local/hive/testData/t_test.bat' into table t_test;
Loading data to table hivetest.t_test
Table hivetest.t_test stats: [numFiles=3, numRows=0, totalSize=120, rawDataSize=0]
OK
Time taken: 1.059 seconds
[root@hadoop3 testData]# ls
test.bat t_test.bat
[root@hadoop3 testData]# ls
test.bat t_test.bat t_hadoop.bat
[root@hadoop3 testData]# hadoop fs -put t_hadoop.bat /user
Loading from HDFS into the table:
hive> load data inpath '/user/t_hadoop.bat' into table t_test;
Loading data to table hivetest.t_test
Table hivetest.t_test stats: [numFiles=4, numRows=0, totalSize=182, rawDataSize=0]
OK
Time taken: 0.728 seconds
Check the files on HDFS:
[root@hadoop3 testData]# hadoop fs -ls /user
Found 2 items
drwxr-xr-x - root supergroup 0 2016-09-24 00:46 /user/hive
drwxr-xr-x - root supergroup 0 2016-09-02 05:25 /user/root
Note: when loading from the local filesystem, the data is first copied to a temporary directory on HDFS and moved into the table's directory once the copy completes. When loading from HDFS the file is moved directly, which is why t_hadoop.bat is gone from /user above. A local load requires the LOCAL keyword before INPATH.
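LOAD DATA also accepts OVERWRITE, which clears the files already under the table directory before moving the new ones in; a sketch reusing the file from above:
hive> load data local inpath '/usr/local/hive/testData/t_test.bat' overwrite into table t_test;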
[root@hadoop1 testData]# ls
[root@hadoop1 testData]#
hive> insert overwrite local directory '/usr/local/hive/testData/t_test.text' row format delimited fields terminated by '\t' select * from t_test;
Query ID = root_20160924201517_fda8eced-7f74-4aa1-9b1f-cc6fdc153064
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Job running in-process (local Hadoop)
2016-09-24 20:15:19,324 Stage-1 map = 100%, reduce = 0%
Ended Job = job_local296856771_0005
Copying data to local directory /usr/local/hive/testData/t_test.text
Copying data to local directory /usr/local/hive/testData/t_test.text
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 1274 HDFS Write: 0 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 1.692 seconds
[root@hadoop1 testData]# cd t_test.text/
[root@hadoop1 t_test.text]# ls
000000_0
[root@hadoop3 t_test.text]# cat 000000_0
1	323	sfdf
345dffefer
78efefeefefeg
1323fegfeefegeg
3545dfdgdgfg
4	dsfef	fgrg
8	drfg	fgrg
9	dgr	rgr
10	dgfrg	rgr
11	dgr	rgrgr
12	dsfe	sfe
1	123	ffgg
2	sfgf	eferg
3	dvf	dvrfg
[root@hadoop1 testData]# ls
[root@hadoop1 testData]#
hive> insert overwrite directory '/usr/local/hive/testData/t_test.text' row format delimited fields terminated by '\t' select * from t_test;
hive> insert into table t_test_table select * from t_test;
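Unlike insert overwrite directory, this form requires the target table to exist beforehand; a minimal sketch to create it with the same schema (t_test_table is the hypothetical target named above):
hive> create table t_test_table like t_test;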
[root@hadoop1 testDate]# vi common.property
set hivevar:db.default=default;
set hivevar:db.allinfo=allinfo;
set hivevar:db.xiaoxu=xiaoxu;
set hivevar:hdfs.url=hdfs://hadoop1:9000;
set mapred.job.queue.name=pms;
set hive.exec.reducers.max=20;
set mapred.reduce.tasks=200;
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=20;
set mapred.job.name=extract_trfc_page_kpi;
hivevar: defines variables for passing parameters into the script files.
[root@hadoop1 testDate]# vi textTable.sql
-- Set the number of reduce tasks for the MR jobs
set mapred.reduce.tasks = 50;
-- Set the HDFS path
set hdfs.url=${hivevar:hdfs.url};
-- Read the database names initialized in the config file
set source.db.name.default=${hivevar:db.default};
set source.db.name.allinfo=${hivevar:db.allinfo};
set out.db.name=${hivevar:db.xiaoxu};
-- Create the databases
CREATE DATABASE IF NOT EXISTS ${hiveconf:source.db.name.allinfo};
CREATE DATABASE IF NOT EXISTS ${hiveconf:out.db.name};
-- Create the table structure
CREATE TABLE IF NOT EXISTS ${hiveconf:out.db.name}.textTable(
id bigint,
name string,
age int
);
-- Populate the table
INSERT OVERWRITE TABLE ${hiveconf:out.db.name}.textTable
SELECT ps.id,ps.name,ps.age FROM ${hiveconf:source.db.name.default}.person ps;
-- Reuse intermediate results held in memory
WITH memoryData1 AS ( SELECT ps.id,ps.name,ps.age FROM ${hiveconf:out.db.name}.textTable ps WHERE age='23'),
memoryData2 AS ( SELECT ps.id,ps.name,ps.age FROM ${hiveconf:out.db.name}.textTable ps WHERE age='24' )
SELECT * FROM memoryData1 UNION ALL SELECT * FROM memoryData2;
hiveconf: reads the parameters set inside the script, and is used together with hivevar. The WITH clause holds the loaded intermediate result in memory for reuse by the final query, so avoid loading overly large data sets this way.
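The two namespaces can be exercised side by side; a small sketch (a and b are hypothetical names, and SELECT without FROM works in Hive 0.13+):
# hive -S --hivevar a=1 --hiveconf b=2 -e 'select ${hivevar:a} + ${hiveconf:b}'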
[root@hadoop1 testDate]# hive -i common.property -f textTable.sql
Logging initialized using configuration in jar:file:/usr/local/hive/lib/hive-common-1.2.1.jar!/hive-log4j.properties
OK
Time taken: 1.642 seconds
OK
Time taken: 0.045 seconds
OK
Time taken: 0.247 seconds
Query ID = root_20170310190113_12f2d1a9-6f30-4991-b09a-11f2cb82b043
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Job running in-process (local Hadoop)
2017-03-10 19:01:19,218 Stage-1 map = 100%, reduce = 0%
Ended Job = job_local546671993_0001
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 130 HDFS Write: 0 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
1	xiaozhang	23
2	xiaowang	24
9	daye	24
Time taken: 6.281 seconds, Fetched: 3 row(s)
The output above shows the execution flow through the map and reduce stages. hive -i loads the initialization file and -f runs the SQL file; the full option list is the hive -help output shown earlier.
hive> show create table result;
OK
CREATE TABLE `result`(
`id` bigint,
`name` string,
`age` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://hadoop1:9000/user/hive/warehouse/xiaoxu.db/result'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='true',
'numFiles'='1',
'numRows'='10',
'rawDataSize'='120',
'totalSize'='130',
'transient_lastDdlTime'='1489208970')
Time taken: 0.287 seconds, Fetched: 19 row(s)
The output above shows the structure of the result table and the HDFS path where its data is stored.
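The same metadata, including the stats and location, can also be viewed in tabular form with the standard describe command:
hive> describe formatted result;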
[root@hadoop1 testDate]# cat hiveVar.sql
set output.db=${hivevar:db.default};
select * from ${hiveconf:output.db}.person where age=${hivevar:age};
[root@hadoop1 testDate]# hive -i common.property --hivevar age='23' -f hiveVar.sql
****************************
OK
Time taken: 9.516 seconds
OK
1	xiaozhang	23
Time taken: 1.028 seconds, Fetched: 1 row(s)
[root@hadoop1 testDate]# vi hiveVar.sql
set output.db=${hivevar:db.default};
select * from ${hiveconf:output.db}.person where age between ${hivevar:age} and ${hivevar:age1};
[root@hadoop1 testDate]# hive -i common.property --hivevar age='10' --hivevar age1='25' -f hiveVar.sql
Time taken: 7.524 seconds
OK
1	xiaozhang	23
2	xiaowang	24
3	xiaoli	25
7	xiaodaye	10
8	dageda	12
9	daye	24
10	dada	25
hive> select * from default.person;
OK
1	xiaozhang	23
2	xiaowang	24
3	xiaoli	25
4	xiaoxiao	26
5	xiaoxiao	27
6	xiaolizi	39
7	xiaodaye	10
8	dageda	12
9	daye	24
10	dada	25
hive> insert overwrite table default.hiveVar select * from default.person;
hive> select * from default.hiveVar;
OK
1	xiaozhang	23
2	xiaowang	24
3	xiaoli	25
4	xiaoxiao	26
5	xiaoxiao	27
6	xiaolizi	39
7	xiaodaye	10
8	dageda	12
9	daye	24
10	dada	25
After running the INSERT OVERWRITE above several times, the query still returns the original number of rows: each OVERWRITE replaces the table's existing data.
hive> insert into table default.hiveVar select * from default.person;
hive> select * from default.hiveVar;
OK
1	xiaozhang	23
2	xiaowang	24
3	xiaoli	25
4	xiaoxiao	26
5	xiaoxiao	27
6	xiaolizi	39
7	xiaodaye	10
8	dageda	12
9	daye	24
10	dada	25
1	xiaozhang	23
2	xiaowang	24
3	xiaoli	25
4	xiaoxiao	26
5	xiaoxiao	27
6	xiaolizi	39
7	xiaodaye	10
8	dageda	12
9	daye	24
10	dada	25
After running the INSERT INTO above several times, the row count equals the original count multiplied by the number of inserts: INTO appends while keeping the previous data.
insert overwrite first removes all data already in the table (or partition) and then writes the query result, so repeated runs leave the table unchanged; it is a full replacement, not a deduplicating merge.
insert into is a plain append with no duplicate check: if N rows identical to the new one already exist, there will be N+1 copies after the insert.
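When actual deduplication is the goal, combine OVERWRITE with SELECT DISTINCT, rewriting the table from itself (a sketch):
hive> insert overwrite table default.hiveVar select distinct * from default.hiveVar;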
hive> dfs -ls /;
Found 14 items
drwx-wx-wx - root supergroup 0 2016-11-05 02:58 /tmp
drwxr-xr-x - root supergroup 0 2016-11-05 03:14 /user
drwxr-xr-x - root supergroup 0 2016-11-06 08:02 /usr
hive> dfs -ls /user/;
Found 1 items
drwxr-xr-x - root supergroup 0 2016-11-05 03:14 /user/hive
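Inside the Hive CLI, dfs accepts any hadoop fs subcommand, so HDFS can be inspected without leaving the shell; for example:
hive> dfs -du -h /user/hive/warehouse;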