2017年11月1日课后作业
SQL -> MapReduce
给非Java编程者对HDFS上的数据做MapReduce查询使用
hive-site.xml 我们配置hive.metastore.warehouse.dir
包括,表的名字,字段的名称、类型 还需要存,数据放在HDFS哪个目录上了
解释器、编译器、优化器
下载、解压 配置环境变量 修改配置文件 启动 hive
安装mysql 设置mysql的权限 配置环境变量 修改配置文件 启动 hive
两种:Client 和 Thrift Server 放在一起的 安装mysql 设置mysql的权限 配置环境变量 修改配置文件 启动 hive --service metastore 启动 hive
Hive Data Definition Language
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
Hive Data Manipulation Language https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML
Hive Data Definition Language
Create/Drop/Alter/Use Database Create/Drop/Truncate Table Alter Table/Partition/Column Create/Drop/Alter View Create/Drop/Alter Index Create/Drop Macro Create/Drop/Reload Function Create/Drop/Grant/Revoke Roles and Privileges
Drop Database 报错信息 FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. InvalidOperationException(message:Database test is not empty. One or more tables exist.)
create table tbltest (id int, name string);
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later) [(col_name data_type [COMMENT col_comment], ... [constraint_specification])] [COMMENT table_comment] [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)] [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS] [SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive 0.10.0 and later)] ON ((col_value, col_value, ...), (col_value, col_value, ...), ...) [STORED AS DIRECTORIES] [ [ROW FORMAT row_format] [STORED AS file_format] | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive 0.6.0 and later) ] [LOCATION hdfs_path] [TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive 0.6.0 and later) [AS select_statement]; -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name LIKE existing_table_or_view_name [LOCATION hdfs_path];
data_type : primitive_type | array_type | map_type | struct_type | union_type -- (Note: Available in Hive 0.7.0 and later)
primitive_type : TINYINT | SMALLINT | INT | BIGINT | BOOLEAN | FLOAT | DOUBLE | DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later) | STRING | BINARY -- (Note: Available in Hive 0.8.0 and later) | TIMESTAMP -- (Note: Available in Hive 0.8.0 and later) | DECIMAL -- (Note: Available in Hive 0.11.0 and later) | DECIMAL(precision, scale) -- (Note: Available in Hive 0.13.0 and later) | DATE -- (Note: Available in Hive 0.12.0 and later) | VARCHAR -- (Note: Available in Hive 0.12.0 and later) | CHAR -- (Note: Available in Hive 0.13.0 and later)
array_type : ARRAY < data_type >
map_type : MAP < primitive_type, data_type >
struct_type : STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type : UNIONTYPE < data_type, data_type, ... > -- (Note: Available in Hive 0.7.0 and later)
row_format : DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char] [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char] [NULL DEFINED AS char] -- (Note: Available in Hive 0.13 and later) | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
file_format: : SEQUENCEFILE | TEXTFILE -- (Default, depending on hive.default.fileformat configuration) | RCFILE -- (Note: Available in Hive 0.6.0 and later) | ORC -- (Note: Available in Hive 0.11.0 and later) | PARQUET -- (Note: Available in Hive 0.13.0 and later) | AVRO -- (Note: Available in Hive 0.14.0 and later) | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
constraint_specification: : [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE ] [, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE ]
1,xiaoming,read-game-code,shanghai:pudong-beijing:chaoyang 2,lilei,read-tv-friend,huoxing:xxx-beijing:haidian 3,hanmeimei,lilei-read,huoxing:xxx-beijing:haidian
1,xiaoming,hobby,address
1|xiaoming|hobby,address
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later) [(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
CREATE TABLE IF NOT EXISTS psn1 ( id int, name string, hobby ARRAY < string >, address MAP < string, string > )ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY '-' MAP KEYS TERMINATED BY ':';
desc psn1;
FAILED: ParseException line 7:1 missing EOF at 'DELIMITED' near ')' FAILED: ParseException line 2:0 cannot recognize input near 'address' 'MAP' '<' in create table statement
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
LOAD DATA LOCAL INPATH '/root/data' INTO TABLE psn1;
select * from psn1
CREATE [TEMPORARY] [EXTERNAL] TABLE
CREATE EXTERNAL TABLE IF NOT EXISTS psn2 ( id int, name string, hobby ARRAY < string >, address MAP < string, string > )ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY '-' MAP KEYS TERMINATED BY ':' LOCATION '/user/psn2' ;
LOAD DATA LOCAL INPATH '/root/data' INTO TABLE psn2;
CREATE TEMPORARY TABLE psn3 ( id int, name string, hobby ARRAY < string >, address MAP < string, string > )ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY '-' MAP KEYS TERMINATED BY ':';
LOAD DATA LOCAL INPATH '/root/data' INTO TABLE psn2;
管理元数据和HDFS上的数据 如果drop table ,hdfs上的数据也会被删除
只管理元数据信息,drop表后,hdfs上的数据不会被删除 关键词是EXTERNAL 需要配置LOCATION的信息,数据文件会存储在配置的路径,不会加上table_name
只对当前的 Session 有效,如果当前 session 退出的话,表会自动删除 临时表优先级是高于其他表的(如果我创建一个同名的临时表,查询到的数据就是这个临时表的) 关键词是TEMPORARY
ALTER TABLE table_name RENAME TO new_table_name;
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name LIKE existing_table_or_view_name [LOCATION hdfs_path];
CREATE TABLE psn3 LIKE psn1;
CREATE TABLE psn4 AS SELECT * FROM psn1;
CTL 只创建空表结构
CTAS 创建表结构,用MapReduce将数据转到hive下面来管理,数据格式,HIve的分隔符
Hive Data Manipulation Language
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
LOAD DATA INPATH '/user/root/data1' INTO TABLE psn1;
总结一下,hdfs上的数据如果load进到hive里面 它实际上是把 data1这个文件从原来的位置 mv 到 hive 的目录下面
千万别没事 Drop表完,服务器存储空间有的是
Standard syntax: INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 FROM from_statement; INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;
Hive extension (multiple inserts):
FROM from_statement INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 [INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] [INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...; FROM from_statement INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 [INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] [INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] ...;
Hive extension (dynamic partition inserts): INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement; INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
INSERT OVERWRITE TABLE tablename1 select_statement1 FROM from_statement;
INSERT OVERWRITE TABLE ps2 SELECT * FROM psn1;
INSERT INTO TABLE tablename1 select_statement1 FROM from_statement;
INSERT INTO TABLE ps2 SELECT * FROM psn1;
FROM from_statement INSERT INTO TABLE tablename select_statement
FROM psn1 INSERT INTO TABLE result select 1,count(id) select 2,count(name)
INSERT INTO TABLE result select 1,count(id) FROM psn1
CREATE TABLE students (name VARCHAR(64), age INT, gpa DECIMAL(3, 2));
INSERT INTO TABLE students VALUES ('fred flintstone', 35, 1.28), ('barney rubble', 32, 2.32);