任务一:导入数据
- 创建两个数据表
- Hive QL代码
USE default; -- 选择使用名为 default 的数据库DROP TABLE IF EXISTS ccf_offline_stage1_train; CREATE TABLE ccf_offline_stage1_train (user_id STRING,merchant_id STRING,coupon_id STRING,discount_rate STRING,distance STRING,date_received STRING,`date` STRING ) -- 创建新表,包含7个字段 -- 这里 data 要打 `` 的原因是data是关键字 ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' -- 指定数据文件使用逗号(,)作为字段分隔符(CSV格式) STORED AS TEXTFILE -- 数据以文本文件格式存储 TBLPROPERTIES ("skip.header.line.count"="1" ); -- 表属性设置:跳过外部数据文件的第一行(通常是标题行)DROP TABLE IF EXISTS ccf_online_stage1_train; CREATE TABLE ccf_online_stage1_train (user_id STRING,merchant_id STRING,action STRING,coupon_id STRING,discount_rate STRING,date_received STRING,`date` STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE TBLPROPERTIES ("skip.header.line.count"="1" );LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_offline_stage1_train.csv' -- 从本地(LOCAL)文件系统(不是HDFS)加载文件 OVERWRITE INTO TABLE ccf_offline_stage1_train; -- 覆盖写入新数据到之前的表中LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_online_stage1_train.csv' OVERWRITE INTO TABLE ccf_online_stage1_train; - 执行结果
![image]()
- Hive QL代码
- 查询验证数据加载成功
- Hive QL代码
USE default;SELECT * FROM ccf_offline_stage1_train LIMIT 5; -- 从 ccf_offline_stage1_train 表中查询所有字段(* 表示所有列),只返回前五列 SELECT * FROM ccf_online_stage1_train LIMIT 5; SELECT COUNT(*) FROM ccf_offline_stage1_train; -- 统计 ccf_offline_stage1_train 表中的总行数 SELECT COUNT(*) FROM ccf_online_stage1_train; - 执行结果
- 查询
![image]()
- 计数
![image]()
![image]()
- 查询
- Hive QL代码
任务二:基本数据查询
- 查询⽤户⾏为数量
- Hive QL代码
USE default;SELECTCASE actionWHEN '0' THEN 'CLICK'WHEN '1' THEN 'BUY'WHEN '2' THEN 'RECEIVE'ELSE CONCAT('UNKNOWN_', action)END AS behavior, -- 将原始的动作编码转换为有意义的英文名称COUNT(*) AS total_count -- 计算每种行为出现的总次数 FROM ccf_online_stage1_train -- 从名为 ccf_online_stage1_train 的数据表中读取数据 GROUP BY action -- 按原始的动作编码进行分组统计 ORDER BY total_count DESC; -- 按行为数量从高到低排序 - 举例
- 假设原始数据是:
0,0,1,0,2,1 - 分组之后
组1 (action='0'): 记录1, 记录2, 记录4 组2 (action='1'): 记录3, 记录6 组3 (action='2'): 记录5 - 对每个组分别统计行数
组1: COUNT(*) = 3 组2: COUNT(*) = 2 组3: COUNT(*) = 1
- 假设原始数据是:
- 执行结果
![image]()
- Hive QL代码
- 查询指定商家优惠券使⽤情况
- Hive QL代码
USE default;DROP TABLE IF EXISTS onlineconsumption_table; CREATE TABLE onlineconsumption_table AS -- 这里的AS是创建表的语法关键字,表示"基于查询结果创建表" SELECTmerchant_id,SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NULL THEN 1 ELSE 0 END) AS negative_samples,SUM(CASE WHEN coupon_id IS NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS normal_consumption,SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS positive_samples FROM ccf_online_stage1_train GROUP BY merchant_id;SELECT * FROM onlineconsumption_table LIMIT 20; - 执行结果
![image]()
- Hive QL代码
任务三:数据聚合分析
- 商家周边活跃顾客数量统计
- Hive QL代码
USE default;DROP TABLE IF EXISTS merchant_distance_active_customers; CREATE TABLE merchant_distance_active_customers AS SELECTmerchant_id,NVL(distance, 'UNKNOWN') AS distance_bucket,COUNT(DISTINCT user_id) AS active_customer_count FROM ccf_offline_stage1_train GROUP BY merchant_id, NVL(distance, 'UNKNOWN');SELECTmerchant_id,distance_bucket,active_customer_count FROM merchant_distance_active_customers ORDER BY merchant_id, distance_bucket LIMIT 50; - 执行结果
![image]()
![image]()
其中第一列是商家ID,第二列是距离x,第三列是数目
- Hive QL代码
- 商家正样本比例统计
- Hive QL代码
USE default;DROP TABLE IF EXISTS merchant_positive_ratio; CREATE TABLE merchant_positive_ratio AS SELECTmerchant_id,positive_samples,negative_samples,normal_consumption,(negative_samples + normal_consumption + positive_samples) AS total_samples,CASEWHEN (negative_samples + normal_consumption + positive_samples) > 0 THENpositive_samples / CAST((negative_samples + normal_consumption + positive_samples) AS DOUBLE)ELSE 0.0END AS positive_ratio FROM onlineconsumption_table;SELECTmerchant_id,positive_ratio,positive_samples,total_samples FROM merchant_positive_ratio ORDER BY positive_ratio DESC, positive_samples DESC LIMIT 10; - 执行结果
![image]()
- Hive QL代码
任务四:复杂查询与分析
- 优惠券使用时间统计
- Hive QL代码
USE default;DROP TABLE IF EXISTS coupon_usage_interval_stats; CREATE TABLE coupon_usage_interval_stats AS WITH offline_clean AS (SELECTCASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id,CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed,CASE WHEN date_received IS NOT NULL AND date_received != 'null' THEN date_received ELSE NULL END AS date_receivedFROM ccf_offline_stage1_train ), coupon_usage AS (SELECTcoupon_id,DATEDIFF(TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_consumed, 'yyyyMMdd'))),TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_received, 'yyyyMMdd')))) AS usage_intervalFROM offline_cleanWHERE coupon_id IS NOT NULLAND date_consumed IS NOT NULLAND date_received IS NOT NULL ), usage_counts AS (SELECTcoupon_id,COUNT(*) AS usage_count,AVG(usage_interval) AS avg_usage_intervalFROM coupon_usageGROUP BY coupon_id ), total_usage AS (SELECT SUM(usage_count) AS total_usage_count FROM usage_counts ) SELECTu.coupon_id,u.usage_count,ROUND(u.avg_usage_interval, 2) AS avg_usage_interval FROM usage_counts u CROSS JOIN total_usage t WHERE u.usage_count > t.total_usage_count * 0.01;SELECTcoupon_id,usage_count,avg_usage_interval FROM coupon_usage_interval_stats ORDER BY usage_count DESC, coupon_id LIMIT 100; - 执行结果
![image]()
- Hive QL代码
- 优惠券折扣率统计
- Hive QL代码
USE default;DROP TABLE IF EXISTS coupon_usage_rate_stats; CREATE TABLE coupon_usage_rate_stats AS WITH offline_clean AS (SELECTCASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id,CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed,CASE WHEN discount_rate IS NOT NULL AND discount_rate != 'null' THEN discount_rate ELSE NULL END AS discount_rateFROM ccf_offline_stage1_train ), coupon_totals AS (SELECTcoupon_id,MAX(discount_rate) AS discount_rate,COUNT(*) AS total_count,SUM(CASE WHEN date_consumed IS NOT NULL THEN 1 ELSE 0 END) AS used_countFROM offline_cleanWHERE coupon_id IS NOT NULLGROUP BY coupon_id ), final_stats AS (SELECTcoupon_id,used_count,total_count,CASE WHEN total_count > 0 THEN used_count / CAST(total_count AS DOUBLE) ELSE 0.0 END AS usage_rate,CASEWHEN discount_rate LIKE '%:%' THEN 'threshold'ELSE 'direct'END AS discount_type,CASEWHEN discount_rate LIKE '%:%' THENCASEWHEN CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE) > 0 THEN(CAST(SPLIT(discount_rate, ':')[1] AS DOUBLE) / CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE)) * 100ELSE NULLENDELSE(1 - CAST(discount_rate AS DOUBLE)) * 100END AS discount_percentageFROM coupon_totals ) SELECTcoupon_id,ROUND(usage_rate, 4) AS usage_rate,used_count,total_count,ROUND(discount_percentage, 2) AS discount_percentage_value,discount_type FROM final_stats;SELECTcoupon_id,usage_rate,used_count,CONCAT(CAST(discount_percentage_value AS STRING), '%') AS discount_percentage,discount_type FROM coupon_usage_rate_stats ORDER BY usage_rate DESC, used_count DESC, coupon_id LIMIT 10; - 执行结果
![image]()
- Hive QL代码










