16. 实战示例-维度建模

实战示例-维度建模

行政地区维度构建

-- 建库
    create database if not exists one_make_dws;
-- 建维度表
    -- 区域粒度【乡镇】
    create external table if not exists one_make_dws.dim_location_areas(
        id string comment 'id'
        , province_id string comment '省份ID'
        , province string comment '省份名称'
        , province_short_name string comment '省份短名称'
        , city_id string comment '城市ID'
        , city string comment '城市'
        , city_short_name string comment '城市短名称'
        , county_id string comment '县城ID'
        , county string comment '县城'
        , county_short_name string comment '县城短名称'
        , area_id string comment '区域ID'
        , area string comment '区域名称'
        , area_short_name string comment '区域短名称'
    ) comment '区域维度区域级别表'
    stored as orc
    tblproperties ("orc.compress"="SNAPPY")
    location '/test/dws/one_make/dim_location_areas';

    -- 县区粒度
    create external table if not exists one_make_dws.dim_location_county(
        id string comment 'id'
        , province_id string comment '省份ID'
        , province string comment '省份名称'
        , province_short_name string comment '省份短名称'
        , city_id string comment '城市ID'
        , city string comment '城市'
        , city_short_name string comment '城市短名称'
        , county_id string comment '县城ID'
        , county string comment '县城'
        , county_short_name string comment '县城短名称'
    ) comment '区域维度表(县城粒度)'
    stored as orc
    tblproperties ("orc.compress"="SNAPPY")
    location '/test/dws/one_make/dim_location_county';    

-- 抽取数据
    -- 区域粒度
    insert overwrite table one_make_dws.dim_location_areas
    select
        /*+repartition(1) */
        t_area.id as id,
        t_province.id as province_id,
        t_province.areaname as province,
        t_province.shortname as province_short_name,
        t_city.id as city_id,
        t_city.areaname as city,
        t_city.shortname as city_short_name,
        t_county.id as county_id,
        t_county.areaname as county,
        t_county.shortname as county_short_name,
        t_area.id as area_id,
        t_area.areaname as area,
        t_area.shortname area_short_name
    from
        one_make_dwd.ciss_base_areas t_area
        inner join one_make_dwd.ciss_base_areas t_county on t_area.rank = 4 and t_area.parentid = t_county.id
        inner join one_make_dwd.ciss_base_areas t_city on t_county.parentid = t_city.id
        inner join one_make_dwd.ciss_base_areas t_province on t_city.parentid = t_province.id
        inner join one_make_dwd.ciss_base_areas t_nation on t_province.parentid = t_nation.id
    ;

    -- 县区粒度
    insert overwrite table one_make_dws.dim_location_county
    select
        /*+repartition(1) */
        t_county.id as id,
        t_province.id as province_id,
        t_province.areaname as province,
        t_province.shortname as province_short_name,
        t_city.id as city_id,
        t_city.areaname as city,
        t_city.shortname as city_short_name,
        t_county.id as county_id,
        t_county.areaname as county,
        t_county.shortname as county_short_name
    from
        one_make_dwd.ciss_base_areas t_county
        inner join one_make_dwd.ciss_base_areas t_city on t_county.rank =3 and t_county.parentid = t_city.id
        inner join one_make_dwd.ciss_base_areas t_province on t_city.parentid = t_province.id
        inner join one_make_dwd.ciss_base_areas t_nation on t_province.parentid = t_nation.id
    ;

日期时间维度构建

-- 建维度表
    -- 创建日期维度表,日期维度表按照年份分区
    create external table if not exists one_make_dws.dim_date(
        date_id string comment '日期id'
        , year_name_cn string comment '年份名称(中文)'
        , year_month_id string comment '年月id'
        , year_month_cn string comment '年月(中文)'
        , quota_id string comment '季度id'
        , quota_namecn string comment '季度名称(中文)'
        , quota_nameen string comment '季度名称(英文)'
        , quota_shortnameen string comment '季度名称(英文简写)'
        , week_in_year_id string comment '周id'
        , week_in_year_name_cn string comment '周(中文)'
        , week_in_year_name_en string comment '周(英文)'
        , weekday int comment '星期'
        , weekday_cn string comment '星期(中文)'
        , weekday_en string comment '星期(英文)'
        , weekday_short_name_en string comment '星期(英文缩写)'
        , yyyymmdd string comment '日期_yyyy_mm_dd'
        , yyyymmdd_cn string comment '日期中文'
        , is_workday string comment '是否工作日'
        , is_weekend string comment '是否周末'
        , is_holiday string comment '是否法定节假日'
        , date_type string comment '日期类型'
    ) comment '时间维度表'
    partitioned by (year integer)
    stored as orc
    location '/test/dws/one_make/dim_date'
    ;

-- 加载数据
  -- HDFS创建路径
    hdfs dfs -mkdir -p /test/dws/one_make/dim_date/2021
    hdfs dfs -put part-00000-cf2fc4b3-7485-4861-81e7-da0c3f76e6de-c000.snappy.orc /test/dws/one_make/dim_date/2021/

  -- 申明分区
    alter table one_make_dws.dim_date add if not exists partition (year='2021') location '/test/dws/one_make/dim_date/2021';

  -- 查看数据
    select * from one_make_dws.dim_date;

服务网点维度构建

-- 建维度表
  -- 服务网点维度表
    create external table if not exists one_make_dws.dim_srv_station(
        id string comment '服务网点id'
        , name string comment '服务网点名称'
        ,code string comment '网点编号'
        ,province_id string comment '省份id'
        ,province string comment '省份名称'
        ,city_id string comment '城市id'
        ,city string comment '城市'
        ,county_id string comment '县城id'
        ,county string comment '县城'
        ,status string comment '服务网点状态'
        ,status_name string comment '状态中文名'
        ,org_id string comment '所属组织机构id'
        ,org_name string comment '所属组件机构名称'
    )comment '服务网点维度表'
    partitioned by (dt string)
    stored as orc
    location '/test/dws/one_make/dim_srv_station';

  -- 加载数据
    insert overwrite table one_make_dws.dim_srv_station partition(dt='20210101')
    select
        station.id
        , station.name
        , station.code
        , province.id as province_id
        , province.areaname as province
        , city.id as city_id
        , city.areaname as city
        , county.id as county_id
        , county.areaname as county
        , station.status as status
        , dict_e.dictname as status_name
        , station.org_id as org_id
        , station.org_name as org_name
    from one_make_dwd.ciss_base_servicestation station
    -- 关联省份RANK为1
    left join one_make_dwd.ciss_base_areas province on station.dt = '20210101' and station.province = province.id and province.rank = 1   
    -- 关联城市RANK为2
    left join one_make_dwd.ciss_base_areas city on station.city = city.id and city.rank = 2 
    -- 关联城市RANK为3
    left join one_make_dwd.ciss_base_areas county on station.region = county.id and county.rank = 3
    -- 关联字典父表(dict_t)
    cross join one_make_dwd.eos_dict_type dict_t  on dict_t.dt = '20210101' and dict_t.dicttypename = '服务网点使用状态'
    -- 关联字典子表(dict_e)
    left join one_make_dwd.eos_dict_entry dict_e on dict_e.dt = '20210101' and dict_t.dicttypeid = dict_e.dicttypeid and station.status = dict_e.dictid;

  -- cross join:笛卡尔积,join时候不给关联条件
      - select * from A,B
      - select * from A join B
      - cross join

油站维度构建

-- 建维度表**
    -- 创建油站维度表
    create external table if not exists one_make_dws.dim_oilstation(
        id string comment '油站ID'
        , name string comment '油站名称'
        , code string comment '油站编码'
        , customer_id string comment '客户ID'
        , customer_name string comment '客户名称'
        , province_id int comment '省份id'
        , province_name string comment '省份名称'
        , city_id int comment '城市id'
        , city_name string comment '城市名称'
        , county_id int comment '县城ID'
        , county_name string comment '县城名称'
        , area_id int comment '区域id'
        , area_name string comment '区域名称'
        , customer_classify_id string comment '客户分类ID'
        , customer_classify_name string comment '客户分类名称'
        , status int comment '油站状态(1、2)'
        , status_name string comment '油站状态名(正常、停用)'
        , company_id int comment '所属公司ID'
        , company_name string comment '所属公司名称'
        , customer_province_id int comment '客户所属省份ID'
        , customer_province_name string comment '客户所属省份'
    ) COMMENT '油站维度表'
    PARTITIONED BY (dt STRING)
    STORED AS TEXTFILE
    LOCATION '/test/dws/one_make/dim_oilstation';

-- 抽取数据
    insert overwrite table one_make_dws.dim_oilstation partition (dt ='20210101')
    select oil.id, oil.name, oil.code, customer_id, customer_name
           , oil.province province_id, p.areaname province_name
           , oil.city city_id, c.areaname city_name
           , oil.region county_id, county.areaname county_name
           , oil.township area_id, a.areaname area_name
           , oil.customer_classify customer_classify_id, ede.dictname customer_classify_name
           , oil.status status, eosde.dictname status_name
           , cbc.company company_id, binfo.companyname company_name
           , proname.id customer_province_id, proname.areaname customer_province_name
    from (
         select id, name, code, customer_id, customer_name, province, city, region, township, status, customer_classify, dt
         from one_make_dwd.ciss_base_oilstation where id != '' and name is not null and name != 'null' and customer_id is not null
         ) oil
         left join (select id, areaname, parentid from one_make_dwd.ciss_base_areas where rank = 1) p on oil.province = p.id
         left join (select id, areaname, parentid from one_make_dwd.ciss_base_areas where rank = 2) c on oil.city = c.id
         left join (select id, areaname, parentid from one_make_dwd.ciss_base_areas where rank = 3) county on oil.region = county.id
         left join (select id, areaname, parentid from one_make_dwd.ciss_base_areas where rank = 4) a on oil.township = a.id
         left join (select dictid, dictname  from one_make_dwd.eos_dict_entry) ede on oil.customer_classify = ede.dictid
         left join (select dictid, dictname from one_make_dwd.eos_dict_entry t1  left join one_make_dwd.eos_dict_type t2 on t1.dicttypeid = t2.dicttypeid where t2.dicttypename = '油站状态') eosde on oil.status = eosde.dictid
         -- 客户所属公司id,所属公司名称,所属省份id,所属省份名称
         left join (select code, province, company from one_make_dwd.ciss_base_customer) cbc on oil.customer_id = cbc.code
         left join (select id, areaname from one_make_dwd.ciss_base_areas where rank = 1 and id != 83) proname on cbc.province = proname.areaname
         left join (select ygcode, companyname from one_make_dwd.ciss_base_baseinfo group by ygcode, companyname) binfo on cbc.company = binfo.ygcode where dt = '20210101';

其他维度

---- 组织机构
  -- 建维度表
    -- 创建组织机构维度表,组织机构人员是经常变动的,所以按照日期分区
    create external table if not exists one_make_dws.dim_emporg(
        empid string comment '人员id'   
        , empcode string comment '人员编码(erp对应的账号id)'
        , empname string comment '人员姓名'
        , userid string comment '用户系统id(登录用户名)'
        , posid string comment '岗位id'
        , posicode string comment '岗位编码'
        , posiname string comment '岗位名称'
        , orgid string comment '部门id'
        , orgcode string comment '部门编码'
        , orgname string comment '部门名称'
    ) comment '组织机构维度表'
    partitioned by (dt string)
    stored as orc
    location '/test/dws/one_make/dim_emporg';

  -- 抽取数据
    -- 先根据dwd层的表进行关联,然后分别把数据取出来
    insert overwrite table one_make_dws.dim_emporg partition(dt='20210101')
    select
        emp.empid as empid
        , emp.empcode as empcode
        , emp.empname as empname
        , emp.userid as userid
        , pos.positionid as posid
        , pos.posicode as posicode
        , pos.posiname as posiname
        , org.orgid as orgid
        , org.orgcode as orgcode
        , org.orgname as orgname
    from  one_make_dwd.org_employee emp
    left join one_make_dwd.org_empposition emppos
        on emp.empid = emppos.empid and emp.dt = '20210101' and emppos.dt = '20210101'
    left join one_make_dwd.org_position pos
        on emppos.positionid = pos.positionid and pos.dt = '20210101'
    left join one_make_dwd.org_organization org
        on pos.orgid = org.orgid and org.dt = '20210101';

---- 仓库、物流
    -- 仓库维度表
      create external table if not exists one_make_dws.dim_warehouse(
          code string comment '仓库编码'
          , name string comment '仓库名称'
          , company_id string comment '所属公司'
          , company string comment '公司名称'
          , srv_station_id string comment '所属服务网点ID'
          , srv_station_name string comment '所属服务网点名称'
      )comment '仓库维度表'
      partitioned by (dt string)
      stored as orc
      location '/test/dws/one_make/dim_warehouse';

    --  加载数据
      insert overwrite table one_make_dws.dim_warehouse partition(dt='20210101')
      select
          warehouse.code as code
          , warehouse.name as name
          , warehouse.company as company_id
          , cmp.compmay as compmay
          , station.id as srv_station_id
          , station.name as srv_station_name
      from
          one_make_dwd.ciss_base_warehouse warehouse
      -- 关联公司信息表
      left join (
           select
                 ygcode as company_id, max(companyname) as compmay
           from one_make_dwd.ciss_base_baseinfo where dt='20210101'
           -- 需要对company信息进行分组去重,里面有一些重复数据 
           group by ygcode) cmp
           on warehouse.dt = '20210101' and cmp.company_id = warehouse.company
      -- 关联服务网点和仓库关系表
      left join one_make_dwd.ciss_r_serstation_warehouse station_r_warehouse
           on station_r_warehouse.dt = '20210101' and station_r_warehouse.warehouse_code = warehouse.code
      -- 关联服务网点表 
      left join one_make_dwd.ciss_base_servicestation station
           on station.dt = '20210101' and station.id = station_r_warehouse.service_station_id;

    -- 物流维度表
      -- 物流维度表(和服务属性表类似)
      create external table if not exists one_make_dws.dim_logistics(
          prop_name string comment '字典名称'
          , type_id string comment '属性id'
          , type_name string comment '属性名称'
      )comment '物流维度表'
      partitioned by (dt string)
      stored as orc

      -- 加载
      insert overwrite table one_make_dws.dim_logistics partition(dt = '20210101')
      select
          dict_t.dicttypename as prop_name
          , dict_e.dictid as type_id
          , dict_e.dictname as type_name
      from  one_make_dwd.eos_dict_type dict_t
      inner join one_make_dwd.eos_dict_entry dict_e
          on dict_t.dt = '20210101'
              and dict_e.dt = '20210101'
              and dict_t.dicttypeid = dict_e.dicttypeid
              and dict_t.dicttypename in (
                  '物流公司'
                  , '物流类型'
              )
      order by dict_t.dicttypename, dict_e.dictid;

常见错误

1. 错误:没有开启Cross Join
  - Spark2.x默认不允许执行笛卡尔积,除非**显示申明cross join**或者开启属性:`spark.sql.crossJoin.enabled  true`

Exception in thread "main" org.apache.spark.sql.AnalysisException: Detected implicit cartesian product for INNER join between logical plans.Use the CROSS JOIN syntax to allow cartesian products between these relations

2. 错误:Unable to move source
  - 重启SparkSQL的ThriftServer,与MetaStore构建新的会话连接

Error: org.apache.spark.sql.AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to move source hdfs://hadoop.bigdata.cn:9000/test/dws/one_make/dim_warehouse/.hive-staging_hive_2020-12-23_04-26-01_363_5663538019799519260-16/-ext-10000/part-00000-63069107-6405-4e31-a55a-6bdeefcd7d9b-c000 to destination hdfs://hadoop.bigdata.cn:9000/test/dws/one_make/dim_warehouse/dt=20210101/part-00000-63069107-6405-4e31-a55a-6bdeefcd7d9b-c000; (state=,code=0)