学自《hive编程指南》
hive (default)> create table employees(
> name string,
> salary float,
> subordinates array<string>,
> deductions map<string, float>,
> address struct<street:string, city:string, state:string, zip:int>)
> partitioned by(country string, state string);
hive (default)> load data local inpath "/home/hadoop/workspace/employees.txt"
> overwrite into table employees
> partition(country='US', state='CA');
Loading data to table default.employees partition (country=US, state=CA)
hive (default)> select * from employees;
John Doe 100000.0 ["Mary Smith","Todd Jones"] {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1} {"street":"1 Michigan Ave.","city":"Chicago","state":"IL","zip":60600} US CA
Mary Smith 80000.0 ["Bill King"] {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1} {"street":"100 Ontario St.","city":"Chicago","state":"IL","zip":60601} US CA
Todd Jones 70000.0 [] {"Federal Taxes":0.15,"State Taxes":0.03,"Insurance":0.1} {"street":"200 Chicago Ave.","city":"Oak Park","state":"IL","zip":60700} US CA
Bill King 60000.0 [] {"Federal Taxes":0.15,"State Taxes":0.03,"Insurance":0.1} {"street":"300 Obscure Dr.","city":"Obscuria","state":"IL","zip":60100} US CA
Boss Man 200000.0 ["John Doe","Fred Finance"] {"Federal Taxes":0.3,"State Taxes":0.07,"Insurance":0.05} {"street":"1 Pretentious Drive.","city":"Chicago","state":"IL","zip":60500} US CA
Fred Finance 150000.0 ["Stacy Accountant"] {"Federal Taxes":0.3,"State Taxes":0.07,"Insurance":0.05} {"street":"2 Pretentious Drive.","city":"Chicago","state":"IL","zip":60500} US CA
Stacy Accountant 60000.0 [] {"Federal Taxes":0.15,"State Taxes":0.03,"Insurance":0.1} {"street":"300 Main St.","city":"Naperville","state":"IL","zip":60563} US CA
hive (default)> select name, salary from employees;
hive (default)> select e.name, e.salary from employees e;
John Doe 100000.0
Mary Smith 80000.0
Todd Jones 70000.0
Bill King 60000.0
Boss Man 200000.0
Fred Finance 150000.0
Stacy Accountant 60000.0
[idx]
,不存在为NULL,提取出的字符串也没有引号hive (default)> select e.name, e.subordinates[0] from employees e;
John Doe Mary Smith
Mary Smith Bill King
Todd Jones NULL
Bill King NULL
Boss Man John Doe
Fred Finance Stacy Accountant
Stacy Accountant NULL
[key]
hive (default)> select e.name, e.deductions['State Taxes'] from employees e;
John Doe 0.05
Mary Smith 0.05
Todd Jones 0.03
Bill King 0.03
Boss Man 0.07
Fred Finance 0.07
Stacy Accountant 0.03
.
hive (default)> select e.name, e.address.city from employees e;
John Doe Chicago
Mary Smith Chicago
Todd Jones Oak Park
Bill King Obscuria
Boss Man Chicago
Fred Finance Chicago
Stacy Accountant Naperville
select `price.*` from stocks;
以 price为前缀的列
hive (default)> select upper(name), salary, deductions['Federal Taxes'],
> round(salary*(1-deductions['Federal Taxes'])) from employees;
JOHN DOE 100000.0 0.2 80000.0
MARY SMITH 80000.0 0.2 64000.0
TODD JONES 70000.0 0.15 59500.0
BILL KING 60000.0 0.15 51000.0
BOSS MAN 200000.0 0.3 140000.0
FRED FINANCE 150000.0 0.3 105000.0
STACY ACCOUNTANT 60000.0 0.15 51000.0
select count(*), avg(salary) from employees;
set hive.map.aggr=true; # 可以提高聚合性能,但需要更多内存
select distinct address.city from employees;
# distinct 去重
hive (default)> select explode(subordinates) as sub from employees;
Mary Smith
Todd Jones
Bill King
John Doe
Fred Finance
Stacy Accountant
limit n 返回 n 行
hive (default)> select name, salary,
> case when salary < 50000 then 'low'
> else 'high'
> end as bracket from employees;
John Doe 100000.0 high
Mary Smith 80000.0 high
Todd Jones 70000.0 high
Bill King 60000.0 high
Boss Man 200000.0 high
Fred Finance 150000.0 high
Stacy Accountant 60000.0 high
hive (default)> select name, address.street from employees where address.street like "%Ave.";
OK
John Doe 1 Michigan Ave.
Todd Jones 200 Chicago Ave.
hive (default)> select name, address.street from employees where address.street like "%Chi%";
OK
Todd Jones 200 Chicago Ave.
hive (default)> select name, address.street from employees where address.street rlike ".*(Chicago|Ontario).*";
OK
Mary Smith 100 Ontario St.
Todd Jones 200 Chicago Ave.
多个表 join 把小的表放在左边
hive> select name from employees tablesample(bucket 3 out of 4 on rand());
John Doe
hive> select name from employees tablesample(bucket 3 out of 4 on rand());
Boss Man
Fred Finance
hive> select name from employees tablesample(bucket 3 out of 4 on name);
Mary Smith
Todd Jones
hive> select name from employees tablesample(bucket 3 out of 4 on name);
Mary Smith
Todd Jones
hive> select name from employees tablesample(70 percent);
John Doe
Mary Smith
Todd Jones
Bill King
Boss Man
将多个表进行合并,每个表必须有相同的列,且字段类型一致
hive> select name from(
> select e1.name from employees e1 where e1.name like "Mary%"
> union all
> select e2.name from employees e2 where e2.name like "Bill%"
> ) name_tab
> sort by name;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Query ID = hadoop_20210411221203_b3dde291-8596-4b91-95e0-707eeaa873f6
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Job running in-process (local Hadoop)
2021-04-11 22:12:04,856 Stage-1 map = 100%, reduce = 100%
Ended Job = job_local1468526053_0003
MapReduce Jobs Launched:
Stage-Stage-1: HDFS Read: 31360 HDFS Write: 0 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
Bill King
Mary Smith