实战示例-维度建模
行政地区维度构建
-- 建库
create database if not exists one_make_dws;
-- 建维度表
-- 区域粒度【乡镇】
create external table if not exists one_make_dws.dim_location_areas(
id string comment 'id'
, province_id string comment '省份ID'
, province string comment '省份名称'
, province_short_name string comment '省份短名称'
, city_id string comment '城市ID'
, city string comment '城市'
, city_short_name string comment '城市短名称'
, county_id string comment '县城ID'
, county string comment '县城'
, county_short_name string comment '县城短名称'
, area_id string comment '区域ID'
, area string comment '区域名称'
, area_short_name string comment '区域短名称'
) comment '区域维度区域级别表'
stored as orc
tblproperties ("orc.compress"="SNAPPY")
location '/test/dws/one_make/dim_location_areas';
-- 县区粒度
create external table if not exists one_make_dws.dim_location_county(
id string comment 'id'
, province_id string comment '省份ID'
, province string comment '省份名称'
, province_short_name string comment '省份短名称'
, city_id string comment '城市ID'
, city string comment '城市'
, city_short_name string comment '城市短名称'
, county_id string comment '县城ID'
, county string comment '县城'
, county_short_name string comment '县城短名称'
) comment '区域维度表(县城粒度)'
stored as orc
tblproperties ("orc.compress"="SNAPPY")
location '/test/dws/one_make/dim_location_county';
-- 抽取数据
-- 区域粒度
insert overwrite table one_make_dws.dim_location_areas
select
/*+repartition(1) */
t_area.id as id,
t_province.id as province_id,
t_province.areaname as province,
t_province.shortname as province_short_name,
t_city.id as city_id,
t_city.areaname as city,
t_city.shortname as city_short_name,
t_county.id as county_id,
t_county.areaname as county,
t_county.shortname as county_short_name,
t_area.id as area_id,
t_area.areaname as area,
t_area.shortname area_short_name
from
one_make_dwd.ciss_base_areas t_area
inner join one_make_dwd.ciss_base_areas t_county on t_area.rank = 4 and t_area.parentid = t_county.id
inner join one_make_dwd.ciss_base_areas t_city on t_county.parentid = t_city.id
inner join one_make_dwd.ciss_base_areas t_province on t_city.parentid = t_province.id
inner join one_make_dwd.ciss_base_areas t_nation on t_province.parentid = t_nation.id
;
-- 县区粒度
insert overwrite table one_make_dws.dim_location_county
select
/*+repartition(1) */
t_county.id as id,
t_province.id as province_id,
t_province.areaname as province,
t_province.shortname as province_short_name,
t_city.id as city_id,
t_city.areaname as city,
t_city.shortname as city_short_name,
t_county.id as county_id,
t_county.areaname as county,
t_county.shortname as county_short_name
from
one_make_dwd.ciss_base_areas t_county
inner join one_make_dwd.ciss_base_areas t_city on t_county.rank =3 and t_county.parentid = t_city.id
inner join one_make_dwd.ciss_base_areas t_province on t_city.parentid = t_province.id
inner join one_make_dwd.ciss_base_areas t_nation on t_province.parentid = t_nation.id
;
日期时间维度构建
-- 建维度表
-- 创建日期维度表,日期维度表按照年份分区
create external table if not exists one_make_dws.dim_date(
date_id string comment '日期id'
, year_name_cn string comment '年份名称(中文)'
, year_month_id string comment '年月id'
, year_month_cn string comment '年月(中文)'
, quota_id string comment '季度id'
, quota_namecn string comment '季度名称(中文)'
, quota_nameen string comment '季度名称(英文)'
, quota_shortnameen string comment '季度名称(英文简写)'
, week_in_year_id string comment '周id'
, week_in_year_name_cn string comment '周(中文)'
, week_in_year_name_en string comment '周(英文)'
, weekday int comment '星期'
, weekday_cn string comment '星期(中文)'
, weekday_en string comment '星期(英文)'
, weekday_short_name_en string comment '星期(英文缩写)'
, yyyymmdd string comment '日期_yyyy_mm_dd'
, yyyymmdd_cn string comment '日期中文'
, is_workday string comment '是否工作日'
, is_weekend string comment '是否周末'
, is_holiday string comment '是否法定节假日'
, date_type string comment '日期类型'
) comment '时间维度表'
partitioned by (year integer)
stored as orc
location '/test/dws/one_make/dim_date'
;
-- 加载数据
-- HDFS创建路径
hdfs dfs -mkdir -p /test/dws/one_make/dim_date/2021
hdfs dfs -put part-00000-cf2fc4b3-7485-4861-81e7-da0c3f76e6de-c000.snappy.orc /test/dws/one_make/dim_date/2021/
-- 申明分区
alter table one_make_dws.dim_date add if not exists partition (year='2021') location '/test/dws/one_make/dim_date/2021';
-- 查看数据
select * from one_make_dws.dim_date;
服务网点维度构建
-- 建维度表
-- 服务网点维度表
create external table if not exists one_make_dws.dim_srv_station(
id string comment '服务网点id'
, name string comment '服务网点名称'
,code string comment '网点编号'
,province_id string comment '省份id'
,province string comment '省份名称'
,city_id string comment '城市id'
,city string comment '城市'
,county_id string comment '县城id'
,county string comment '县城'
,status string comment '服务网点状态'
,status_name string comment '状态中文名'
,org_id string comment '所属组织机构id'
,org_name string comment '所属组件机构名称'
)comment '服务网点维度表'
partitioned by (dt string)
stored as orc
location '/test/dws/one_make/dim_srv_station';
-- 加载数据
insert overwrite table one_make_dws.dim_srv_station partition(dt='20210101')
select
station.id
, station.name
, station.code
, province.id as province_id
, province.areaname as province
, city.id as city_id
, city.areaname as city
, county.id as county_id
, county.areaname as county
, station.status as status
, dict_e.dictname as status_name
, station.org_id as org_id
, station.org_name as org_name
from one_make_dwd.ciss_base_servicestation station
-- 关联省份RANK为1
left join one_make_dwd.ciss_base_areas province on station.dt = '20210101' and station.province = province.id and province.rank = 1
-- 关联城市RANK为2
left join one_make_dwd.ciss_base_areas city on station.city = city.id and city.rank = 2
-- 关联城市RANK为3
left join one_make_dwd.ciss_base_areas county on station.region = county.id and county.rank = 3
-- 关联字典父表(dict_t)
cross join one_make_dwd.eos_dict_type dict_t on dict_t.dt = '20210101' and dict_t.dicttypename = '服务网点使用状态'
-- 关联字典子表(dict_e)
left join one_make_dwd.eos_dict_entry dict_e on dict_e.dt = '20210101' and dict_t.dicttypeid = dict_e.dicttypeid and station.status = dict_e.dictid;
-- cross join:笛卡尔积,join时候不给关联条件
- select * from A,B
- select * from A join B
- cross join
油站维度构建
-- 建维度表**
-- 创建油站维度表
create external table if not exists one_make_dws.dim_oilstation(
id string comment '油站ID'
, name string comment '油站名称'
, code string comment '油站编码'
, customer_id string comment '客户ID'
, customer_name string comment '客户名称'
, province_id int comment '省份id'
, province_name string comment '省份名称'
, city_id int comment '城市id'
, city_name string comment '城市名称'
, county_id int comment '县城ID'
, county_name string comment '县城名称'
, area_id int comment '区域id'
, area_name string comment '区域名称'
, customer_classify_id string comment '客户分类ID'
, customer_classify_name string comment '客户分类名称'
, status int comment '油站状态(1、2)'
, status_name string comment '油站状态名(正常、停用)'
, company_id int comment '所属公司ID'
, company_name string comment '所属公司名称'
, customer_province_id int comment '客户所属省份ID'
, customer_province_name string comment '客户所属省份'
) COMMENT '油站维度表'
PARTITIONED BY (dt STRING)
STORED AS TEXTFILE
LOCATION '/test/dws/one_make/dim_oilstation';
-- 抽取数据
insert overwrite table one_make_dws.dim_oilstation partition (dt ='20210101')
select oil.id, oil.name, oil.code, customer_id, customer_name
, oil.province province_id, p.areaname province_name
, oil.city city_id, c.areaname city_name
, oil.region county_id, county.areaname county_name
, oil.township area_id, a.areaname area_name
, oil.customer_classify customer_classify_id, ede.dictname customer_classify_name
, oil.status status, eosde.dictname status_name
, cbc.company company_id, binfo.companyname company_name
, proname.id customer_province_id, proname.areaname customer_province_name
from (
select id, name, code, customer_id, customer_name, province, city, region, township, status, customer_classify, dt
from one_make_dwd.ciss_base_oilstation where id != '' and name is not null and name != 'null' and customer_id is not null
) oil
left join (select id, areaname, parentid from one_make_dwd.ciss_base_areas where rank = 1) p on oil.province = p.id
left join (select id, areaname, parentid from one_make_dwd.ciss_base_areas where rank = 2) c on oil.city = c.id
left join (select id, areaname, parentid from one_make_dwd.ciss_base_areas where rank = 3) county on oil.region = county.id
left join (select id, areaname, parentid from one_make_dwd.ciss_base_areas where rank = 4) a on oil.township = a.id
left join (select dictid, dictname from one_make_dwd.eos_dict_entry) ede on oil.customer_classify = ede.dictid
left join (select dictid, dictname from one_make_dwd.eos_dict_entry t1 left join one_make_dwd.eos_dict_type t2 on t1.dicttypeid = t2.dicttypeid where t2.dicttypename = '油站状态') eosde on oil.status = eosde.dictid
-- 客户所属公司id,所属公司名称,所属省份id,所属省份名称
left join (select code, province, company from one_make_dwd.ciss_base_customer) cbc on oil.customer_id = cbc.code
left join (select id, areaname from one_make_dwd.ciss_base_areas where rank = 1 and id != 83) proname on cbc.province = proname.areaname
left join (select ygcode, companyname from one_make_dwd.ciss_base_baseinfo group by ygcode, companyname) binfo on cbc.company = binfo.ygcode where dt = '20210101';
其他维度
---- 组织机构
-- 建维度表
-- 创建组织机构维度表,组织机构人员是经常变动的,所以按照日期分区
create external table if not exists one_make_dws.dim_emporg(
empid string comment '人员id'
, empcode string comment '人员编码(erp对应的账号id)'
, empname string comment '人员姓名'
, userid string comment '用户系统id(登录用户名)'
, posid string comment '岗位id'
, posicode string comment '岗位编码'
, posiname string comment '岗位名称'
, orgid string comment '部门id'
, orgcode string comment '部门编码'
, orgname string comment '部门名称'
) comment '组织机构维度表'
partitioned by (dt string)
stored as orc
location '/test/dws/one_make/dim_emporg';
-- 抽取数据
-- 先根据dwd层的表进行关联,然后分别把数据取出来
insert overwrite table one_make_dws.dim_emporg partition(dt='20210101')
select
emp.empid as empid
, emp.empcode as empcode
, emp.empname as empname
, emp.userid as userid
, pos.positionid as posid
, pos.posicode as posicode
, pos.posiname as posiname
, org.orgid as orgid
, org.orgcode as orgcode
, org.orgname as orgname
from one_make_dwd.org_employee emp
left join one_make_dwd.org_empposition emppos
on emp.empid = emppos.empid and emp.dt = '20210101' and emppos.dt = '20210101'
left join one_make_dwd.org_position pos
on emppos.positionid = pos.positionid and pos.dt = '20210101'
left join one_make_dwd.org_organization org
on pos.orgid = org.orgid and org.dt = '20210101';
---- 仓库、物流
-- 仓库维度表
create external table if not exists one_make_dws.dim_warehouse(
code string comment '仓库编码'
, name string comment '仓库名称'
, company_id string comment '所属公司'
, company string comment '公司名称'
, srv_station_id string comment '所属服务网点ID'
, srv_station_name string comment '所属服务网点名称'
)comment '仓库维度表'
partitioned by (dt string)
stored as orc
location '/test/dws/one_make/dim_warehouse';
-- 加载数据
insert overwrite table one_make_dws.dim_warehouse partition(dt='20210101')
select
warehouse.code as code
, warehouse.name as name
, warehouse.company as company_id
, cmp.compmay as compmay
, station.id as srv_station_id
, station.name as srv_station_name
from
one_make_dwd.ciss_base_warehouse warehouse
-- 关联公司信息表
left join (
select
ygcode as company_id, max(companyname) as compmay
from one_make_dwd.ciss_base_baseinfo where dt='20210101'
-- 需要对company信息进行分组去重,里面有一些重复数据
group by ygcode) cmp
on warehouse.dt = '20210101' and cmp.company_id = warehouse.company
-- 关联服务网点和仓库关系表
left join one_make_dwd.ciss_r_serstation_warehouse station_r_warehouse
on station_r_warehouse.dt = '20210101' and station_r_warehouse.warehouse_code = warehouse.code
-- 关联服务网点表
left join one_make_dwd.ciss_base_servicestation station
on station.dt = '20210101' and station.id = station_r_warehouse.service_station_id;
-- 物流维度表
-- 物流维度表(和服务属性表类似)
create external table if not exists one_make_dws.dim_logistics(
prop_name string comment '字典名称'
, type_id string comment '属性id'
, type_name string comment '属性名称'
)comment '物流维度表'
partitioned by (dt string)
stored as orc
-- 加载
insert overwrite table one_make_dws.dim_logistics partition(dt = '20210101')
select
dict_t.dicttypename as prop_name
, dict_e.dictid as type_id
, dict_e.dictname as type_name
from one_make_dwd.eos_dict_type dict_t
inner join one_make_dwd.eos_dict_entry dict_e
on dict_t.dt = '20210101'
and dict_e.dt = '20210101'
and dict_t.dicttypeid = dict_e.dicttypeid
and dict_t.dicttypename in (
'物流公司'
, '物流类型'
)
order by dict_t.dicttypename, dict_e.dictid;
常见错误
1. 错误:没有开启Cross Join
- Spark2.x默认不允许执行笛卡尔积,除非**显示申明cross join**或者开启属性:`spark.sql.crossJoin.enabled true`
Exception in thread "main" org.apache.spark.sql.AnalysisException: Detected implicit cartesian product for INNER join between logical plans.Use the CROSS JOIN syntax to allow cartesian products between these relations
2. 错误:Unable to move source
- 重启SparkSQL的ThriftServer,与MetaStore构建新的会话连接
Error: org.apache.spark.sql.AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to move source hdfs://hadoop.bigdata.cn:9000/test/dws/one_make/dim_warehouse/.hive-staging_hive_2020-12-23_04-26-01_363_5663538019799519260-16/-ext-10000/part-00000-63069107-6405-4e31-a55a-6bdeefcd7d9b-c000 to destination hdfs://hadoop.bigdata.cn:9000/test/dws/one_make/dim_warehouse/dt=20210101/part-00000-63069107-6405-4e31-a55a-6bdeefcd7d9b-c000; (state=,code=0)