上海高端网站开发公安邦消防安全技术服务有限公司
news/
2025/10/1 20:32:01/
文章来源:
上海高端网站开发公,安邦消防安全技术服务有限公司,建网站商城平台,申请网站网站

Hive是一个数据仓库基础的应用工具，在Hadoop中用来处理结构化数据，它架构在Hadoop之上，通过SQL来对数据进行操作，了解SQL的人，学起来毫不费力。Hive 查询操作过程严格遵守Hadoop MapReduce 的作业执行模型，…

Hive是一个数据仓库基础的应用工具，在Hadoop中用来处理结构化数据，它架构在Hadoop之上，通过SQL来对数据进行操作，了解SQL的人，学起来毫不费力。Hive 查询操作过程严格遵守Hadoop MapReduce 的作业执行模型，Hive 将用户的Hive SQL 语句通过解释器转换为MapReduce 作业提交到Hadoop 集群上，Hadoop 监控作业执行过程，然后返回作业执行结果给用户。Hive 并非为联机事务处理而设计，Hive 并不提供实时的查询和基于行级的数据更新操作。Hive 的最佳使用场合是大数据集的批处理作业，例如网络日志分析。下面我们就为大家总结了一些Hive的常用 SQL语法，[ ]括起来的代表我们可以写也可以不写的语句。

创建数据库：
CREATE DATABASE name;

显示命令：
show tables;
show databases;
show partitions table_name;
show functions;
describe extended table_name.col_name;

DDL(Data Defination Language)数据库定义语言

建表：
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
  [(col_name data_type [COMMENT col_comment], ...)]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [ROW FORMAT row_format]
  [STORED AS file_format]
  [LOCATION hdfs_path]

CREATE TABLE 创建一个指定名字的表。如果相同名字的表已经存在，则抛出异常；用户可以用 IF NOT EXISTS 选项来忽略这个异常。
EXTERNAL 关键字可以让用户创建一个外部表，在建表的同时指定一个指向实际数据的路径(LOCATION)。
LIKE 允许用户复制现有的表结构，但是不复制数据。
COMMENT 可以为表与字段增加描述。

ROW FORMAT：
DELIMITED [FIELDS TERMINATED BY char] [COLLECTION ITEMS TERMINATED BY char] [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]

STORED AS：
SEQUENCEFILE | TEXTFILE | RCFILE | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
如果文件数据是纯文本，可以使用 STORED AS TEXTFILE；如果数据需要压缩，使用 STORED AS SEQUENCEFILE。

创建简单表：
CREATE TABLE person(name STRING, age INT);

创建外部表:
CREATE EXTERNAL TABLE page_view(viewTime INT, userid BIGINT,
  page_url STRING, referrer_url STRING,
  ip STRING COMMENT 'IP Address of the User',
  country STRING COMMENT 'country of origination')
COMMENT '这里写表的描述信息'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\054'
STORED AS TEXTFILE
LOCATION '<hdfs_location>';

创建分区表：
CREATE TABLE 
par_table(viewTime INT, userid BIGINT,
  page_url STRING, referrer_url STRING,
  ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(date STRING, pos STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED AS SEQUENCEFILE;

创建分桶表：
CREATE TABLE par_table(viewTime INT, userid BIGINT,
  page_url STRING, referrer_url STRING,
  ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(date STRING, pos STRING)
CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
STORED AS SEQUENCEFILE;

创建带索引字段的表：
CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (dindex STRING);

复制一个空表：
CREATE TABLE empty_key_value_store LIKE key_value_store;

显示所有表：
SHOW TABLES;

按正则表达式显示表：
hive> SHOW TABLES '.*s';

表中添加一个字段：
ALTER TABLE pokes ADD COLUMNS (new_col INT);

添加一个字段并为其添加注释：
hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');

删除列：
hive> ALTER TABLE test REPLACE COLUMNS(id BIGINT, name STRING);

更改表名：
hive> ALTER TABLE events RENAME TO 3koobecaf;

增加、删除分区：
#增加
ALTER TABLE table_name ADD [IF NOT EXISTS] partition_spec [ LOCATION 'location1' ] partition_spec [ LOCATION 'location2' ] ...
partition_spec:
  : PARTITION (partition_col = partition_col_value, partition_col = partiton_col_value, ...)
#删除
ALTER TABLE table_name DROP partition_spec, partition_spec,...

改变表的文件格式与组织：
ALTER TABLE table_name SET FILEFORMAT file_format
ALTER TABLE table_name CLUSTERED BY(userid) SORTED BY(viewTime) INTO num_buckets BUCKETS
#这个命令修改了表的物理存储属性

创建和删除视图：
#创建视图
CREATE VIEW [IF NOT EXISTS] view_name [ (column_name [COMMENT column_comment], ...) 
][COMMENT view_comment][TBLPROPERTIES (property_name = property_value, ...)] AS SELECT ...
#删除视图
DROP VIEW view_name

DML(Data Manipulation Language)：数据操作语言，主要是数据库增删改三种操作，DML包括：INSERT插入、UPDATE更新、DELETE删除。

向数据表内加载文件：
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
#load操作只是单纯的复制/移动操作，将数据文件移动到Hive表对应的位置。
#加载本地
LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
#加载HDFS数据，同时给定分区信息
hive> LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');

将查询结果插入到Hive表：
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;
#多插入模式
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ...] select_statement2] ...
#自动分区模式
INSERT OVERWRITE TABLE tablename PARTITION (partcol1=[val1], partcol2=[val2] ...) select_statement FROM from_statement;

将查询结果插入到HDFS文件系统中：
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 SELECT ... FROM ... 
FROM from_statement
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
[INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2]

INSERT INTO：
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;

insert overwrite和insert into的区别：
insert overwrite 会覆盖已经存在的数据，假如原始表使用overwrite，上述的数据会先将原始表的数据remove，再插入新数据。
insert into 只是简单的插入，不考虑原始表的数据，直接追加到表中。最后表的数据是原始数据和新插入数据。

DQL(Data Query Language)数据查询语言 select操作

SELECT查询结构：
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[ CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY | ORDER BY col_list] ]
[LIMIT number]

使用ALL和DISTINCT选项区分对重复记录的处理。默认是ALL，表示查询所有记录；DISTINCT表示去掉重复的记录。
Where 条件：类似我们传统SQL的where 条件。
ORDER BY：全局排序，只有一个Reduce任务。
SORT BY：只在本机做排序。
LIMIT：限制输出的个数和输出起始位置。

将查询数据输出至目录：
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='2008-08-15';

将查询结果输出至本地目录：
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;

将一个表的结果插入到另一个表：
FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(1) WHERE a.foo > 0 GROUP BY a.bar;
INSERT OVERWRITE TABLE events SELECT a.bar, count(1) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;

JOIN：
FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar) INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;

将多表数据插入到同一表中：
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;

Hive 只支持等值连接(equality joins)、外连接(outer joins)和左半连接(left semi joins)。Hive 不支持所有非等值的连接，因为非等值连接非常难转化到 map/reduce 任务。
LEFT、RIGHT和FULL OUTER关键字用于处理join中空记录的情况。
LEFT SEMI JOIN 是 IN/EXISTS 子查询的一种更高效的实现。
join 时，每次 map/reduce 任务的逻辑是这样的：reducer 会缓存 join 
序列中除了最后一个表的所有表的记录，再通过最后一个表将结果序列化到文件系统。实际应用过程中，应尽量使用小表join大表。

join查询时应注意的点：
#只支持等值连接
SELECT a.* FROM a JOIN b ON (a.id = b.id)
SELECT a.* FROM a JOIN b ON (a.id = b.id AND a.department = b.department)
#可以 join 多个表
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)
#如果join中多个表的 join key 是同一个，则 join 会被转化为单个 map/reduce 任务

LEFT、RIGHT和FULL OUTER关键字：
#左外连接
SELECT a.val, b.val FROM a LEFT OUTER JOIN b ON (a.key = b.key)
#右外连接
SELECT a.val, b.val FROM a RIGHT OUTER JOIN b ON (a.key = b.key)
#满外连接
SELECT a.val, b.val FROM a FULL OUTER JOIN b ON (a.key = b.key)

LEFT SEMI JOIN关键字：
#LEFT SEMI JOIN 的限制是，JOIN 子句中右边的表只能在 ON 子句中设置过滤条件，在 WHERE 子句、SELECT 子句或其他地方过滤都不行
SELECT a.key, a.value FROM a WHERE a.key in (SELECT b.key FROM b);
#可以被写为
SELECT a.key, a.val FROM a LEFT SEMI JOIN b on (a.key = b.key)

UNION 与 UNION ALL：
#用来合并多个select的查询结果，需要保证select中字段须一致
select_statement UNION ALL select_statement UNION ALL select_statement ...
#UNION 和 UNION ALL的区别
#UNION只会查询到两个表中不同的数据，相同的部分不会被查出
#UNION ALL会把两个表的所有数据都查询出
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/924247.shtml
如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!