数据处理 Hive Sql

Hive Sql
Hive是一个在Hadoop中用来处理结构化数据的数据仓库基础工具。它架构在Hadoop之上,用来进行数据提取、转化、加载,这是一种可以存储、查询和分析存储在Hadoop中的大规模数据的机制。hive数据仓库工具能将结构化的数据文件映射为一张数据库表,并提供SQL查询功能,能将SQL语句转变成MapReduce任务来执行。
术语“大数据”是大型数据集,其中包括体积庞大,高速,以及各种由与日俱增的数据的集合。使用传统的数据管理系统难以加工大型数据。因此,Apache软件基金会推出了一款名为Hadoop的解决大数据管理和处理难题的框架。

官方文档: https://www.docs4dev.com/docs/zh/apache-hive/3.1.1/reference/LanguageManual_DML.html

建表语句样例

1
2
3
4
5
6
7
8
DROP TABLE IF EXISTS dev.trans_temp;
CREATE TABLE dev.trans_temp(
account_id bigint,
trans_id int,
product_id string,
sales float
);
insert into dev.trans_temp

时间格式化

1
2
year(place_time) as order_year
month(place_time) as order_month

计算两时间之间的时间差

1
datediff(current_date,last_purchase_date)

求recency

1
datediff(current_date,max(to_date(place_time))) as recency

时间筛选

1
2
3
4
-- 最近5天
where to_date(sap_time) between date_sub(current_date,5) and date_sub(current_date,1)
-- 最近30天
where to_date(dt) >= date_sub(current_date,30)

判断某订单是否在大促期间购买

1
2
3
4
5
6
7
8
9
-- 即订单发生时间在大促开始之后 大促结束之前
case when ((unix_timestamp(place_time)-unix_timestamp(promotion_start_time))/3600) <= promotion_hours
and ((unix_timestamp(place_time)-unix_timestamp(promotion_start_time))/3600) >=0 then '大促时购买' else null end as promotion_behavior


,case when array_contains(collect_set(promotion_behavior) over (partition by sales_member_id,sales_order_number),'大促时购买') then 'private' else 'normal' end as promotion_behavior

,case when array_contains(collect_set(promotion_behavior),'private') and !array_contains(collect_set(promotion_behavior),'normal') then '只在大促购买'
when array_contains(collect_set(promotion_behavior),'normal') and !array_contains(collect_set(promotion_behavior),'private') then '只在非大促购买' else '大促及非大促都购买' end as private_sale_sensitivity

节日

1
2
3
4
where to_date(place_time) between date_sub(concat(year(current_date),'-','01-01'),7) and date_add(concat(year(current_date),'-','01-01'),7)--New Year
or to_date(place_time) between date_sub(concat(year(current_date),'-','02-14'),7) and date_add(concat(year(current_date),'-','02-14'),7)--Valentine
or to_date(place_time) between date_sub(concat(year(current_date),'-','{Qixi}'),7) and date_add(concat(year(current_date),'-','{Qixi}'),7)--Qixi
or to_date(place_time) between date_sub(concat(year(current_date)-1,'-','12-25'),7) and date_add(concat(year(current_date)-1,'-','12-25'),7)--Christmas

case when 判断时间

1
2
3
4
5
case hour(sap_time) when 0 then '[0-1)' when 1 then '[1-2)' when 2 then '[2-3)' when 3 then '[3-4)' when 4 then '[4-5)'
when 5 then '[5-6)' when 6 then '[6-7)' when 7 then '[7-8)' when 8 then '[8-9)' when 9 then '[9-10)'
when 10 then '[10-11)' when 11 then '[11-12)' when 12 then '[12-13)' when 13 then '[13-14)' when 14 then '[14-15)'
when 15 then '[15-16)' when 16 then '[16-17)' when 17 then '[17-18)' when 18 then '[18-19)' when 19 then '[19-20)'
when 20 then '[20-21)' when 21 then '[21-22)' when 22 then '[22-23)' when 23 then '[23-0)' end as hour_name

hive 判断星期几

1
2
3
4
5
6
7
case when pmod(datediff(place_time, '1920-01-01') - 3, 7) = 1 then 'Monday'
when pmod(datediff(place_time, '1920-01-01') - 3, 7) = 2 then 'Tuesday'
when pmod(datediff(place_time, '1920-01-01') - 3, 7) = 3 then 'Wednesday'
when pmod(datediff(place_time, '1920-01-01') - 3, 7) = 4 then 'Thursday'
when pmod(datediff(place_time, '1920-01-01') - 3, 7) = 5 then 'Friday'
when pmod(datediff(place_time, '1920-01-01') - 3, 7) = 6 then 'Saturday'
else 'Sunday' end as wd_name

排序

1
2
3
4
,row_number() over(partition by account_id order by sap_time desc) as rn
,row_number() over(partition by account_id order by sales desc) as rn
,case when casales=max(casales) over(partition by sales_member_id) then category else null end preferred_category --偏好
,sort_array(array(seasonal_share,festival_share,promotion_share,gifting_share,exclusive_share)) as sort_arr

返回第一个非空值

1
,COALESCE(preferred_Subcategory,LAST_VALUE(preferred_Subcategory,true) over(partition by sales_member_id order by preferred_subcategory desc )) preferred_Subcategory

数学运算

1
2
3
4
5
sum()
avg()
std()
max()
min()

分位数

1
percentile_approx(price_sensitivity,0.3)

百分比

1
2
3
4
5
6
7
8
9
10
11
select sku_cd,category,sap_price
,case when rn>=0 and rn<0.3 then N'低价'
when rn>=0.3 and rn<0.7 then N'中价'
when rn>=0.7 and rn<=1 then N'高价' else null end as price_tier_type
into #sku_price_tier
from(
select sku_cd,category,sap_price
,round(PERCENT_RANK() OVER( partition by category ORDER BY sap_price), 5) as rn
from #sku_price
where category is not null
)t1

取下一行

1
,lead(channel,1,null) over(partition by account_id order by sap_time) as next_channel

取上一行

1
,lag(channel,1,null) over(partition by account_id order by sap_time) as next_channel

like

1
case when XX like '%礼盒%' then 1 else 0 end as if_gift

转数字类型+ 文本格式化

1
cast(regexp_replace(member_card,'JD','') as bigint) as member_card

hive 首字母大写

1
2
3
,case when member_card_grade in ('PINK','WHITE','BLACK','GOLD')
then concat(substr(member_card_grade, 1, 1), lower(substr(member_card_grade, 2)))
else null end as member_card_grade

文本筛选 判断是数字并且不为空

1
2
where  (user_id rlike '^\\d+$' or user_id is null or user_id in ('null','NULL',' ',''))
and (card_no rlike '^\\d+$' or card_no is null or card_no in ('null','NULL',' ',''))

字符串拼接

1
2
3
4
5
,case when chcnt = 1 then concat(store,'新客') when chcnt > 1 then concat(store,'老客') else concat(store,'未购买过') end as channel_status

,CONCAT(t3.brand_name collate Chinese_PRC_CS_AI_WS,' ' collate Chinese_PRC_CS_AI_WS
,t1.item_product_id collate Chinese_PRC_CS_AI_WS,' ' collate Chinese_PRC_CS_AI_WS
,t3.product_name_cn collate Chinese_PRC_CS_AI_WS) as prod

hive 行转列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
select sales_member_id
,max(case when item_brand_name = 'Lancome' then sales else 0 end) as lancome_sales
,max(case when item_brand_name = 'Lauder' then sales else 0 end) as lauder_sales
,max(case when item_brand_name = 'Guerlain' then sales else 0 end) as guerlain_sales
,max(case when item_brand_name = 'Skii' then sales else 0 end) as skll_sales
,max(case when item_brand_name = 'Shiseido' then sales else 0 end) as shiseido_sales
from (
select sales_member_id,item_brand_name
,sum(item_apportion_amount) as sales
from da_dev.tagging_system_sales_order_vb_temp
group by sales_member_id,item_brand_name
) t1
group by sales_member_id

分组 列拼接

1
2
3
4
5
6
7
8
9
10
select sales_member_id,concat_ws(',',collect_set(item_category)) as dragon_registered_category
from(
select sales_member_id,item_category
from (
select sales_member_id,sales_order_number,item_category
,row_number() over(partition by sales_member_id,sales_order_number order by place_time) as rn
from DA.tagging_system_sales_order_basic_temp
where store='丝芙兰官网'
) t1 where rn=1
) tt1 group by sales_member_id

Hive

hive 不能用 update

1
2
3
4
5
6
-- insert overwrite (aaa, bbb, ccc, ddd)
select aaa,
null as bbb,
null as ccc,
null as ddd
from Table

服务器 hive sql 常用语句

开启服务器jupyter

nohup jupyter notebook –ip=xx.xx.xxx.xxx –port=8888 &(现已开不用运行)

打开hive环境:

hive

数据库里有哪些Schema:

show databases;

使用xxSchema:

use u_analysis_app

显示这个Schema里有哪些表:

show tables;

显示xx信息:

desc tagging_daily;

按符号拆分字符串

split_part(product_articleid,’&’,1)

Powered by Hexo and Hexo-theme-hiker

Copyright © 2013 - 2024 HELLO WORLD All Rights Reserved.

UV : | PV :