11. 在线教育仓库示例-访问与咨询看板-增量

在线教育仓库示例-访问与咨询看板-增量

数据准备

CREATE TABLE web_chat_ems_2024_10 LIKE web_chat_ems_2019_07;
INSERT into web_chat_ems_2024_10  SELECT * FROM `web_chat_ems_2019_07` WHERE create_time BETWEEN '2019-07-01 00:00:00' and '2019-07-02 00:00:00';
UPDATE web_chat_ems_2024_10 set CREATE_time = CONCAT('2024-10-28 ',SUBSTR(create_time, 12))
UPDATE web_chat_ems_2024_10 set CREATE_date_time = CONCAT('2024-10-28 ',SUBSTR(create_time, 12))

CREATE TABLE web_chat_text_ems_2024_10 like web_chat_text_ems_2019_07;
INSERT into web_chat_text_ems_2024_10 SELECT temp2.* from (SELECT * FROM web_chat_ems_2019_07 WHERE create_time BETWEEN '2019-07-01 00:00:00' and '2019-07-02 00:00:00') temp1 JOIN web_chat_text_ems_2019_07 temp2 on temp1.id = temp2.id;
# 执行sqoop脚本, 完成数据增量采集
# -- 访问咨询主表
su - admin
sqoop import \
--connect jdbc:mysql://172.16.100.42:4000/nev \
--username root \
--password 84^j1TZ-+2b9A@S7Hs \
--query 'SELECT
id,create_date_time,session_id,sid,create_time,seo_source,seo_keywords,ip,
AREA,country,province,city,origin_channel,USER AS user_match,manual_time,begin_time,end_time,
last_customer_msg_time_stamp,last_agent_msg_time_stamp,reply_msg_count,
msg_count,browser_name,os_info, "2024-10-29" AS starts_time
FROM web_chat_ems_2024_10 where 1=1 and $CONDITIONS' \
--hcatalog-database itcast_ods \
--hcatalog-table web_chat_ems \
-m 1 

# -- 访问咨询附属表
sqoop import \
--connect jdbc:mysql://172.16.100.42:4000/nev \
--username root \
--password 84^j1TZ-+2b9A@S7Hs \
--query 'SELECT 
  *, "2024-10-29" AS start_time
FROM web_chat_text_ems_2024_10 where 1=1 and $CONDITIONS' \
--hcatalog-database itcast_ods \
--hcatalog-table web_chat_text_ems \
-m 1 

Sqoop 增量数据同步脚本示例

# 获取今天的日期
date
date +%Y%m%d
# 指定日期获取内容 -d或--date=
# 获取指定日期的年月日格式输出
date -d "2024-10-29" +%Y%m%d
# 获取指定日期的星期(周几)格式输出
date --date="2024-10-29" +%w
# 日期加减 -d或--date==
#获取上周日期(day,month,year,hour)
date -d "-1 week" +%Y%m%d
#获取昨天日期
date -d '-1 day' "+%Y-%m-%d"
date --date="-24 hour" +%Y%m%d
# 日期格式化规则:
%a 当前域的星期缩写 (Sun..Sat)
%A 当前域的星期全写 (Sunday..Saturday)
%b 当前域的月份缩写(Jan..Dec)
%B 当前域的月份全称 (January..December)
%d 两位的天 (01..31)
%D 短时间格式 (mm/dd/yy)
%e 短格式天 ( 1..31)
%F 文件时间格式 same as %Y-%m-%d
%h same as %b
%H 24小时制的小时 (00..23)
%I 12小时制的小时 (01..12)
%j 一年中的第几天 (001..366)
%k 短格式24小时制的小时 ( 0..23)
%m 双位月份 (01..12)
%M 双位分钟 (00..59)
%r 12小时制的时间表示(时:分:秒,双位) time, 12-hour (hh:mm:ss [AP]M)
%R 24小时制的时间表示 (时:分,双位)time, 24-hour (hh:mm)
%s 自基础时间 1970-01-01 00:00:00 到当前时刻的秒数(a GNU extension)
%T 24小时制时间表示(hh:mm:ss)
%u 数字表示的星期(从星期一开始 1-7)
%x 本地日期格式 (mm/dd/yy)
%X 本地时间格式 (%H:%M:%S)
%y 两位的年(00..99)
%Y 年 (1970…)
# 变量替换
A=Linux
echo $AB    #表示变量AB
echo ${A}B    #表示变量A后连接着B
# 命令替换
 bash shell 中,$( )与` `(反引号)都是用来作命令替换的。
命令替换与变量替换差不多,区别是一个为执行变量,一个为执行命令,先完成引号里的命令行,然后将其结果替换出来,再重组成新的命令行。
echo $(date "+%Y-%m-%d")    # 2020-07-01
echo today is `date "+%Y-%m-%d"`     # today is 2020-07-01
# 数学运算
双小括号命令是用来执行数学表达式的,可以在其中进行各种逻辑运算、数学运算,也支持更多的运算符(如++、--等)。
echo $(((5 * 2)))        #10
在(( )) 中的变量名称,可于其前面加 $ 符号来执行替换,也可以不用。
i=5
echo $(((i=$i*2)))                 #10
echo $(((i=i*2)))                 #20

# $(())
# $((( )))的缩写。
echo $(((i*2)))          #40
echo $((i*2))           #40
# 串行with与并行&
shell脚本默认是按顺序串行with执行的,使用&可以将一个命令放在后台运行,从而使shell脚本能够继续往后执行:
sleep 5 &
echo "done"
上面的脚本执行后会立即打印出"done",sleep命令被扔给后台执行,不会阻塞脚本执行。
如果想要在进入下个循环前,必须等待上个后台命令执行完毕,可以使用wait命令:
sleep 5 &
wait
echo "done"
这样,需要等待5s后才能在屏幕上看到"done"。

测试

vim sqoop_test.sh
#! /bin/bash
# SQOOP_HOME=/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/bin/sqoop
SQOOP_HOME=/usr/bin/sqoop
if [[ $1 == "" ]];then
# 默认为昨天
   TD_DATE=`date -d '1 days ago' "+%Y-%m-%d"`
else
   TD_DATE=$1
fi

# 判断TD_DATE时间格式是否正确
if [[ $TD_DATE =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]];then
    echo "TD_DATE is $TD_DATE"
else
    echo "TD_DATE is not correct"
    exit 1
fi

V_YEAR=$(date --date="${TD_DATE}" +%Y)
V_MONTH=$(date --date="${TD_DATE}" +%m)
V_TABLE_web_chat_ems="web_chat_ems_${V_YEAR}_${V_MONTH}"
V_TABLE_web_chat_text_ems="web_chat_text_ems_${V_YEAR}_${V_MONTH}"

echo $V_YEAR $V_MONTH $V_TABLE_web_chat_ems $V_TABLE_web_chat_text_ems

${SQOOP_HOME} eval \
--connect jdbc:mysql://172.16.100.42:4000/nev \
--username root \
--password 84^j1TZ-+2b9A@S7Hs \
--query "select id,
       create_date_time,
       session_id,
       sid,
       create_time,
       seo_source,
       seo_keywords,
       ip,
       area,
       country,
       province,
       city,
       origin_channel,
       user as user_match,
       manual_time,
       begin_time,
       end_time,
       last_customer_msg_time_stamp,
       last_agent_msg_time_stamp,
       reply_msg_count,
       msg_count,
       browser_name,
       os_info,
       '${TD_DATE}' as starts_time
from ${V_TABLE_web_chat_ems}
where create_time between '${TD_DATE} 00:00:00' and '${TD_DATE} 23:59:59' limit 20;"
# sh sqoop_test.sh
# sh sh.sh 2019-07-01
vim sqoop_incremental.sh
#! /bin/bash
# SQOOP_HOME=/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/bin/sqoop
SQOOP_HOME=/usr/bin/sqoop
if [[ $1 == "" ]];then
# 默认为昨天
   TD_DATE=`date -d '1 days ago' "+%Y-%m-%d"`
else
   TD_DATE=$1
fi

# 判断TD_DATE时间格式是否正确
if [[ $TD_DATE =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]];then
    echo "TD_DATE is $TD_DATE"
else
    echo "TD_DATE is not correct"
    exit 1
fi

V_YEAR=$(date --date="${TD_DATE}" +%Y)
V_MONTH=$(date --date="${TD_DATE}" +%m)
V_TABLE_web_chat_ems="web_chat_ems_${V_YEAR}_${V_MONTH}"
V_TABLE_web_chat_text_ems="web_chat_text_ems_${V_YEAR}_${V_MONTH}"

echo $V_YEAR $V_MONTH $V_TABLE_web_chat_ems $V_TABLE_web_chat_text_ems

${SQOOP_HOME} import \
--connect jdbc:mysql://172.16.100.42:4000/nev \
--username root \
--password 84^j1TZ-+2b9A@S7Hs \
--query "select id,
       create_date_time,
       session_id,
       sid,
       create_time,
       seo_source,
       seo_keywords,
       ip,
       area,
       country,
       province,
       city,
       origin_channel,
       user as user_match,
       manual_time,
       begin_time,
       end_time,
       last_customer_msg_time_stamp,
       last_agent_msg_time_stamp,
       reply_msg_count,
       msg_count,
       browser_name,
       os_info,
       '${TD_DATE}' as starts_time
from ${V_TABLE_web_chat_ems}
where create_time between '${TD_DATE} 00:00:00' and '${TD_DATE} 23:59:59' and \$CONDITIONS" \
--hcatalog-database itcast_ods \
--hcatalog-table web_chat_ems \
--hive-partition-key starts_time \
--hive-partition-value ${TD_DATE} \
-m 5 \
--split-by id

# 如果想要确保等待上个后台命令执行完毕再执行后续任务,可以使用wait命令:
wait

${SQOOP_HOME} import \
--connect jdbc:mysql://172.16.100.42:4000/nev \
--username root \
--password 84^j1TZ-+2b9A@S7Hs \
--query "
    select id,
        referrer,
        from_url,
        landing_page_url,
        url_title,
        platform_description,
        other_params,
        history,
        '${TD_DATE}' as start_time
    from ${V_TABLE_web_chat_text_ems} wcte,
        (select id as wce_id, create_time
        from ${V_TABLE_web_chat_ems}
        where create_time between '${TD_DATE} 00:00:00' and '${TD_DATE} 23:59:59') wce
    where wcte.id = wce.wce_id
    and \$CONDITIONS" \
--hcatalog-database itcast_ods \
--hcatalog-table web_chat_text_ems \
--hive-partition-key start_time \
--hive-partition-value ${TD_DATE} \
-m 5 \
--split-by id

# sh sqoop_incremental.sh 2024-10-29

访问客户量 OOzie Shell 增量清洗转换脚本示例

# 增量清洗转换时,如果同一天的分区已有旧数据,需要覆盖掉,则可以使用Insert overwrite覆盖写入,或者通过sql语句删除。
# 否则会出现数据重复,导致后续计算出错。(注意内部表不能直接删除HDFS分区)
# 需要增加where条件,指向ODS层的采集日期start_time字段,只清洗转换昨天的数据,旧数据已经计算过,不需要重复计算。
--动态分区配置
set hive.exec.dynamic.partition=true;
set hive.exec.max.dynamic.partitions=2000;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=10000;
set hive.exec.max.dynamic.partitions=100000;
set hive.exec.max.created.files=150000;
--hive压缩
set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
--写入时压缩生效
set hive.exec.orc.compression.strategy=COMPRESSION;

insert into table itcast_dwd.visit_consult_dwd partition (yearinfo, monthinfo, dayinfo)
select 
    wce.session_id,
    wce.sid,
    unix_timestamp(wce.create_time, 'yyyy-MM-dd HH:mm:ss.SSS') as create_time,
    wce.seo_source,
    wce.ip,
    wce.area,
    cast(if(wce.msg_count is null, 0, wce.msg_count) as int) as msg_count,
    wce.origin_channel,
    wcte.referrer,
    wcte.from_url,
    wcte.landing_page_url,
    wcte.url_title,
    wcte.platform_description,
    wcte.other_params,
    wcte.history,
    substr(wce.create_time, 12, 2) as hourinfo,
    quarter(wce.create_time) as quarterinfo,
    substr(wce.create_time, 1, 4) as yearinfo,
    substr(wce.create_time, 6, 2) as monthinfo,
    substr(wce.create_time, 9, 2) as dayinfo
from itcast_ods.web_chat_ems wce inner join itcast_ods.web_chat_text_ems wcte
on wce.id = wcte.id
where wce.starts_time='2024-10-29';

#! /bin/bash
HIVE_HOME=/usr/bin/hive
if [[ $1 == "" ]];then
# 默认为昨天
   TD_DATE=`date -d '1 days ago' "+%Y-%m-%d"`
else
   TD_DATE=$1
fi

# 判断TD_DATE时间格式是否正确
if [[ $TD_DATE =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]];then
    echo "TD_DATE is $TD_DATE"
else
    echo "TD_DATE is not correct"
    exit 1
fi

${HIVE_HOME} -S -e "
    --动态分区配置
    set hive.exec.dynamic.partition=true;
    set hive.exec.max.dynamic.partitions=2000;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.max.dynamic.partitions.pernode=10000;
    set hive.exec.max.dynamic.partitions=100000;
    set hive.exec.max.created.files=150000;
    --hive压缩
    set hive.exec.compress.intermediate=true;
    set hive.exec.compress.output=true;
    --写入时压缩生效
    set hive.exec.orc.compression.strategy=COMPRESSION;

    insert into table itcast_dwd.visit_consult_dwd partition (yearinfo, monthinfo, dayinfo)
    select
        wce.session_id,
        wce.sid,
        unix_timestamp(wce.create_time, 'yyyy-MM-dd HH:mm:ss.SSS') as create_time,
        wce.seo_source,
        wce.ip,
        wce.area,
        cast(if(wce.msg_count is null, 0, wce.msg_count) as int) as msg_count,
        wce.origin_channel,
        wcte.referrer,
        wcte.from_url,
        wcte.landing_page_url,
        wcte.url_title,
        wcte.platform_description,
        wcte.other_params,
        wcte.history,
        substr(wce.create_time,12,2) as hourinfo,
        quarter(wce.create_time) as quarterinfo,
        substr(wce.create_time,1,4) as yearinfo,
        substr(wce.create_time,6,2) as monthinfo,
        substr(wce.create_time,9,2) as dayinfo
    from itcast_ods.web_chat_ems wce inner join itcast_ods.web_chat_text_ems wcte
    on wce.id = wcte.id
    where wce.starts_time='${TD_DATE}';
"

访问客户量 OOzie Shell 增量统计分析脚本示例

# 增量统计时,同时增加where条件,只统计昨天所在的区间数据(年、月、日、时)。
注意DWS层维度较多, 小时数据和天数据(2019-07-01/2019-07-01 10:00:00),季度数据和年数据(2019-Q3/2019),不同维度存在分区目录相同的情况。
如果使用的是Insert overwrite覆盖写入,天和小时数据存在于同一个分区中,会出现新数据互相覆盖的问题。
使用Insert Into比较安全。
分区内有旧数据的话,则要先通过sql删除旧数据:
-- 精确删除数据
--删除20190901的05点数据
delete from itcast_dws.visit_dws where yearinfo='2019' and monthinfo='07' and dayinfo='01' and hourinfo='05' and time_type=1;
--删除20190901天数据
delete from itcast_dws.visit_dws where yearinfo='2019' and monthinfo='09' and dayinfo='01' and time_type=2;
--删除201909月数据
delete from itcast_dws.visit_dws where yearinfo='2019' and monthinfo='09' and dayinfo='-1' and time_type=3;
--删除2019年3季度数据
delete from itcast_dws.visit_dws where yearinfo='2019' and quarterinfo='3' and time_type=4;
--删除2019年数据
delete from itcast_dws.visit_dws where yearinfo='2019' and time_type=4;

--删除分区下所有数据
--删除20190901天和小时数据(会把日期里面的小时数据一起删除掉,月和年尽量使用delete语句)
alter table itcast_dws.visit_dws drop partition (yearinfo='2019', monthinfo='09', dayinfo='01');

#! /bin/bash
# kube-42 节点 /usr/bin/hive
HIVE_HOME=/usr/bin/hive

if [[ $1 == "" ]];then
# 默认为昨天
   TD_DATE=`date -d '1 days ago' "+%Y-%m-%d"`
else
   TD_DATE=$1
fi

# 判断TD_DATE时间格式是否正确
if [[ $TD_DATE =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]];then
    echo "TD_DATE is $TD_DATE"
else
    echo "TD_DATE is not correct"
    exit 1
fi

TD_YEAR=$(date -d "$TD_DATE" +%Y)
TD_MONTH=$(date -d "$TD_DATE" +%m)
TD_DAY=$(date -d "$TD_DATE" +%d)
month_for_quarter=`date --date="$TD_DATE" +%-m`
TD_QUARTER=$((($month_for_quarter-1)/3+1))

# hive
# -S Silent mode in interactive shell
# -e SQL from command line
# -f SQL from file

${HIVE_HOME} -S -e "
    --动态分区配置
    set hive.exec.dynamic.partition=true;
    set hive.exec.max.dynamic.partitions=2000;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.max.dynamic.partitions.pernode=10000;
    set hive.exec.max.dynamic.partitions=100000;
    set hive.exec.max.created.files=150000;
    --hive压缩
    set hive.exec.compress.intermediate=true;
    set hive.exec.compress.output=true;
    --写入时压缩生效
    set hive.exec.orc.compression.strategy=COMPRESSION;

    ---------地区分组
    --小时
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        area,
        '-1' as seo_source,
        '-1' as origin_channel,
        hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo,' ',hourinfo) as time_str,
        '-1' as from_url,
        '1' as grouptype,
        '1' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by area, yearinfo, quarterinfo, monthinfo, dayinfo, hourinfo;

    --天
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo) as time_str,
        '-1' as from_url,
        '1' as grouptype,
        '2' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by area, yearinfo, quarterinfo, monthinfo, dayinfo;

    --月
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo) as time_str,
        '-1' as from_url,
        '1' as grouptype,
        '3' as time_type,
        yearinfo, monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH'
    group by area, yearinfo, quarterinfo, monthinfo;

    --季度
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-Q',quarterinfo) as time_str,
        '-1' as from_url,
        '1' as grouptype,
        '4' as time_type,
        yearinfo,
        '-1' as monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and quarterinfo='$TD_QUARTER'
    group by area, yearinfo, quarterinfo;
    --年
    INSERT overwrite TABLE itcast_dws.visit_dws PARTITION (yearinfo,monthinfo,dayinfo)
    select 
    COUNT(DISTINCT wce.sid) as sid_total,
    COUNT(DISTINCT wce.session_id) as sessionid_total,
    COUNT(DISTINCT wce.ip) as ip_total,
    wce.area as area,
    '-1' as seo_source,
    '-1' as origin_channel,
    '-1' as hourinfo,
    '-1' as quarterinfo,
    wce.yearinfo as time_str,
    '-1' as from_url,
    '1' as groupType,
    '5' as time_type,
    wce.yearinfo as yearinfo,
    '-1' as monthinfo,
    '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd wce
    where yearinfo='$TD_YEAR'
    group by wce.area,wce.yearinfo;

    ---------搜索来源分组
    --小时
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        seo_source,
        '-1' as origin_channel,
        hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo,' ',hourinfo) as time_str,
        '-1' as from_url,
        '2' as grouptype,
        '1' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by seo_source, yearinfo, quarterinfo, monthinfo, dayinfo, hourinfo;

    --天
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo) as time_str,
        '-1' as from_url,
        '2' as grouptype,
        '2' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by seo_source, yearinfo, quarterinfo, monthinfo, dayinfo;

    --月
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo) as time_str,
        '-1' as from_url,
        '2' as grouptype,
        '3' as time_type,
        yearinfo, monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH'
    group by seo_source, yearinfo, quarterinfo, monthinfo;

    --季度
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-Q',quarterinfo) as time_str,
        '-1' as from_url,
        '2' as grouptype,
        '4' as time_type,
        yearinfo,
        '-1' as monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and quarterinfo='$TD_QUARTER'
    group by seo_source, yearinfo, quarterinfo;
    --年
    INSERT overwrite TABLE itcast_dws.visit_dws PARTITION (yearinfo,monthinfo,dayinfo)
    select 
    COUNT(DISTINCT wce.sid) as sid_total,
    COUNT(DISTINCT wce.session_id) as sessionid_total,
    COUNT(DISTINCT wce.ip) as ip_total,
    '-1' as  area,
    seo_source,
    '-1' as origin_channel,
    '-1' as hourinfo,
    '-1' as quarterinfo,
    wce.yearinfo as time_str,
    '-1' as from_url,
    '2' as groupType,
    '5' as time_type,
    wce.yearinfo as yearinfo,
    '-1' as monthinfo,
    '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd wce
    where yearinfo='$TD_YEAR'
    group by wce.seo_source,wce.yearinfo;

    ---------来源渠道分组
    --小时
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        origin_channel,
        hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo,' ',hourinfo) as time_str,
        '-1' as from_url,
        '3' as grouptype,
        '1' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by origin_channel, yearinfo, quarterinfo, monthinfo, dayinfo, hourinfo;

    --天
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo) as time_str,
        '-1' as from_url,
        '3' as grouptype,
        '2' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by origin_channel, yearinfo, quarterinfo, monthinfo, dayinfo;

    --月
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo) as time_str,
        '-1' as from_url,
        '3' as grouptype,
        '3' as time_type,
        yearinfo, monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH'
    group by origin_channel, yearinfo, quarterinfo, monthinfo;

    --季度
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-Q',quarterinfo) as time_str,
        '-1' as from_url,
        '3' as grouptype,
        '4' as time_type,
        yearinfo,
        '-1' as monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and quarterinfo='$TD_QUARTER'
    group by origin_channel, yearinfo, quarterinfo;
    --年
    INSERT overwrite TABLE itcast_dws.visit_dws PARTITION (yearinfo,monthinfo,dayinfo)
    select 
    COUNT(DISTINCT wce.sid) as sid_total,
    COUNT(DISTINCT wce.session_id) as sessionid_total,
    COUNT(DISTINCT wce.ip) as ip_total,
    '-1' as  area,
    '-1' as seo_source,
    origin_channel,
    '-1' as hourinfo,
    '-1' as quarterinfo,
    wce.yearinfo as time_str,
    '-1' as from_url,
    '3' as groupType,
    '5' as time_type,
    wce.yearinfo as yearinfo,
    '-1' as monthinfo,
    '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd wce
    where yearinfo='$TD_YEAR'
    group by wce.origin_channel,wce.yearinfo;

    ---------会话来源页面分组
    --小时
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        '-1' as origin_channel,
        hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo,' ',hourinfo) as time_str,
        from_url,
        '4' as grouptype,
        '1' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by from_url, yearinfo, quarterinfo, monthinfo, dayinfo, hourinfo;

    --天
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo) as time_str,
        from_url,
        '4' as grouptype,
        '2' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by from_url, yearinfo, quarterinfo, monthinfo, dayinfo;

    --月
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo) as time_str,
        from_url,
        '4' as grouptype,
        '3' as time_type,
        yearinfo, monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH'
    group by from_url, yearinfo, quarterinfo, monthinfo;

    --季度
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-Q',quarterinfo) as time_str,
        from_url,
        '4' as grouptype,
        '4' as time_type,
        yearinfo,
        '-1' as monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and quarterinfo='$TD_QUARTER'
    group by from_url, yearinfo, quarterinfo;
    --年
    INSERT overwrite TABLE itcast_dws.visit_dws PARTITION (yearinfo,monthinfo,dayinfo)
    select 
    COUNT(DISTINCT wce.sid) as sid_total,
    COUNT(DISTINCT wce.session_id) as sessionid_total,
    COUNT(DISTINCT wce.ip) as ip_total,
    '-1' as  area,
    '-1' as seo_source,
    '-1' as origin_channel,
    '-1' as hourinfo,
    '-1' as quarterinfo,
    wce.yearinfo as time_str,
    from_url,
    '4' as groupType,
    '5' as time_type,
    wce.yearinfo as yearinfo,
    '-1' as monthinfo,
    '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd wce
    where yearinfo='$TD_YEAR'
    group by wce.from_url,wce.yearinfo;

    ---------总访问量分组
    --小时
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        '-1' as origin_channel,
        hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo,' ',hourinfo) as time_str,
        '-1' as from_url,
        '5' as grouptype,
        '1' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by yearinfo, quarterinfo, monthinfo, dayinfo, hourinfo;

    --天
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo,'-',dayinfo) as time_str,
        '-1' as from_url,
        '5' as grouptype,
        '2' as time_type,
        yearinfo, monthinfo, dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo='$TD_DAY'
    group by yearinfo, quarterinfo, monthinfo, dayinfo;

    --月
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-',monthinfo) as time_str,
        '-1' as from_url,
        '5' as grouptype,
        '3' as time_type,
        yearinfo, monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH'
    group by yearinfo, quarterinfo, monthinfo;

    --季度
    insert into itcast_dws.visit_dws partition (yearinfo, monthinfo, dayinfo)
    select 
        count(distinct sid) as sid_total,
        count(distinct session_id) as session_total,
        count(distinct ip) as ip_total,
        '-1' as area,
        '-1' as seo_source,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat(yearinfo,'-Q',quarterinfo) as time_str,
        '-1' as from_url,
        '5' as grouptype,
        '4' as time_type,
        yearinfo,
        '-1' as monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd 
    where yearinfo='$TD_YEAR' and quarterinfo='$TD_QUARTER'
    group by yearinfo, quarterinfo;
    --年
    INSERT overwrite TABLE itcast_dws.visit_dws PARTITION (yearinfo,monthinfo,dayinfo)
    select 
    COUNT(DISTINCT wce.sid) as sid_total,
    COUNT(DISTINCT wce.session_id) as sessionid_total,
    COUNT(DISTINCT wce.ip) as ip_total,
    '-1' as  area,
    '-1' as seo_source,
    '-1' as origin_channel,
    '-1' as hourinfo,
    '-1' as quarterinfo,
    wce.yearinfo as time_str,
    '-1' as from_url,
    '5' as groupType,
    '5' as time_type,
    wce.yearinfo as yearinfo,
    '-1' as monthinfo,
    '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd wce
    where yearinfo='$TD_YEAR'
    group by wce.yearinfo;
"

访问客户量 导出数据

# 增量数据导出mysql,需要做成sh脚本,用oozie定时触发,分析流程完毕后执行导出。
# 因为每次统计最大的时间范围是年,所以每次导出都需要覆盖导出一年的数据。此时需要先将mysql中的旧数据删除,然后再将新数据导入。

#! /bin/bash
SQOOP_HOME=/usr/bin/sqoop
if [[ $1 == "" ]];then
   TD_DATE=`date -d '1 days ago' "+%Y-%m-%d"`
else
   TD_DATE=$1
fi

# 判断TD_DATE时间格式是否正确
if [[ $TD_DATE =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]];then
    echo "TD_DATE is $TD_DATE"
else
    echo "TD_DATE is not correct"
    exit 1
fi

TD_YEAR=$(date -d "$TD_DATE" +%Y)

mysql -uroot -p'84^j1TZ-+2b9A@S7Hs' -h172.16.100.42 -P4000 -e "delete from scrm_bi.itcast_visit where yearinfo='$TD_YEAR';"

wait

$SQOOP_HOME export \
--connect "jdbc:mysql://172.16.100.42:4000/scrm_bi?useUnicode=true&characterEncoding=utf-8" \
--username root \
--password 84^j1TZ-+2b9A@S7Hs \
--table visit_dws \
--hcatalog-database itcast_dws \
--hcatalog-table visit_dws \
--hcatalog-partition-keys yearinfo \
--hcatalog-partition-values $TD_YEAR \
-m 1
#### **咨询客户量实现 OOzie Shell 增量统计分析脚本示例**
#! /bin/bash
HIVE_HOME=/usr/bin/hive
if [[ $1 == "" ]];then
   TD_DATE=`date -d '1 days ago' "+%Y-%m-%d"`
else
   TD_DATE=$1
fi

# 判断TD_DATE时间格式是否正确
if [[ $TD_DATE =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]];then
    echo "TD_DATE is $TD_DATE"
else
    echo "TD_DATE is not correct"
    exit 1
fi

TD_YEAR=$(date -d "$TD_DATE" +%Y)
TD_MONTH=$(date -d "$TD_DATE" +%m)
TD_DAY=$(date -d "$TD_DATE" +%d)
TD_QUARTER=$((($TD_MONTH-1)/3+1))

$HIVE_HOME -S -e "
    --动态分区配置
    set hive.exec.dynamic.partition=true;
    set hive.exec.max.dynamic.partitions=2000;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.max.dynamic.partitions.pernode=10000;
    set hive.exec.max.dynamic.partitions=100000;
    set hive.exec.max.created.files=150000;
    --hive压缩
    set hive.exec.compress.intermediate=true;
    set hive.exec.compress.output=true;
    --写入时压缩生效
    set hive.exec.orc.compression.strategy=COMPRESSION;

    -------地区分组
    --天
    insert into itcast_dws.consult_dws partition (yearinfo, monthinfo, dayinfo)
    select
        count(distinct sid),
        count(distinct session_id),
        count(distinct ip),
        area,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat_ws('-',yearinfo,monthinfo,dayinfo),
        '1',
        '2',
        yearinfo,monthinfo,dayinfo
    from itcast_dwd.visit_consult_dwd
    where msg_count >= 1 and yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo = '$TD_DAY'
    group by area, yearinfo, quarterinfo, monthinfo, dayinfo;

    --月
    insert into itcast_dws.consult_dws partition (yearinfo, monthinfo, dayinfo)
    select
        count(distinct sid),
        count(distinct session_id),
        count(distinct ip),
        area,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat_ws('-',yearinfo,monthinfo),
        '1',
        '3',
        yearinfo,monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd
    where msg_count >= 1 and yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH'
    group by area, yearinfo, quarterinfo, monthinfo;

    --季度
    insert into itcast_dws.consult_dws partition (yearinfo, monthinfo, dayinfo)
    select
        count(distinct sid),
        count(distinct session_id),
        count(distinct ip),
        area,
        '-1' as origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat_ws('-Q',yearinfo,quarterinfo),
        '1',
        '4',
        yearinfo,'-1' as monthinfo,'-1' as dayinfo
    from itcast_dwd.visit_consult_dwd
    where msg_count >= 1 and yearinfo='$TD_YEAR' and quarterinfo='$TD_QUARTER'
    group by area, yearinfo, quarterinfo;

    --年
    insert into itcast_dws.consult_dws partition (yearinfo, monthinfo, dayinfo)
    select
        count(distinct sid),
        count(distinct session_id),
        count(distinct ip),
        area,
        '-1' as origin_channel,
        '-1' as hourinfo,
        '-1' as quarterinfo,
        yearinfo,
        '1',
        '5',
        yearinfo,'-1' as monthinfo,'-1' as dayinfo
    from itcast_dwd.visit_consult_dwd
    where msg_count >= 1 and yearinfo='$TD_YEAR'
    group by area, yearinfo;

    ------来源渠道分组
    --天
    insert into itcast_dws.consult_dws partition (yearinfo, monthinfo, dayinfo)
    select
        count(distinct sid),
        count(distinct session_id),
        count(distinct ip),
        '-1' as area,
        origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat_ws('-',yearinfo,monthinfo,dayinfo),
        '2',
        '2',
        yearinfo,monthinfo,dayinfo
    from itcast_dwd.visit_consult_dwd
    where msg_count >= 1 and yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH' and dayinfo = '$TD_DAY'
    group by origin_channel, yearinfo, quarterinfo, monthinfo, dayinfo;

    --月
    insert into itcast_dws.consult_dws partition (yearinfo, monthinfo, dayinfo)
    select
        count(distinct sid),
        count(distinct session_id),
        count(distinct ip),
        '-1' as area,
        origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat_ws('-',yearinfo,monthinfo),
        '2',
        '3',
        yearinfo,monthinfo,
        '-1' as dayinfo
    from itcast_dwd.visit_consult_dwd
    where msg_count >= 1 and yearinfo='$TD_YEAR' and monthinfo='$TD_MONTH'
    group by origin_channel, yearinfo, quarterinfo, monthinfo;

    --季度
    insert into itcast_dws.consult_dws partition (yearinfo, monthinfo, dayinfo)
    select
        count(distinct sid),
        count(distinct session_id),
        count(distinct ip),
        '-1' as area,
        origin_channel,
        '-1' as hourinfo,
        quarterinfo,
        concat_ws('-Q',yearinfo,quarterinfo),
        '2',
        '4',
        yearinfo,'-1' as monthinfo,'-1' as dayinfo
    from itcast_dwd.visit_consult_dwd
    where msg_count >= 1 and yearinfo='$TD_YEAR' and quarterinfo='$TD_QUARTER'
    group by origin_channel, yearinfo, quarterinfo;

    --年
    insert into itcast_dws.consult_dws partition (yearinfo, monthinfo, dayinfo)
    select
        count(distinct sid),
        count(distinct session_id),
        count(distinct ip),
        '-1' as area,
        origin_channel,
        '-1' as hourinfo,
        '-1' as quarterinfo,
        yearinfo,
        '2',
        '5',
        yearinfo,'-1' as monthinfo,'-1' as dayinfo
    from itcast_dwd.visit_consult_dwd
    where msg_count >= 1 and yearinfo='$TD_YEAR'
    group by origin_channel, yearinfo;
"

咨询客户量实现 导出数据

#! /bin/bash
SQOOP_HOME=/usr/bin/sqoop
if [[ $1 == "" ]];then
   TD_DATE=`date -d '1 days ago' "+%Y-%m-%d"`
else
   TD_DATE=$1
fi

# 判断TD_DATE时间格式是否正确
if [[ $TD_DATE =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]];then
    echo "TD_DATE is $TD_DATE"
else
    echo "TD_DATE is not correct"
    exit 1
fi

TD_YEAR=$(date -d "$TD_DATE" +%Y)

mysql  -uroot -p'84^j1TZ-+2b9A@S7Hs' -h172.16.100.42 -P4000 -e "
    delete from scrm_bi.itcast_consult where yearinfo='$TD_YEAR';
"

wait

$SQOOP_HOME export \
--connect "jdbc:mysql://172.16.100.42:4000/scrm_bi?useUnicode=true&characterEncoding=utf-8" \
--username root \
--password 84^j1TZ-+2b9A@S7Hs \
--table consult_dws \
--hcatalog-database itcast_dws \
--hcatalog-table consult_dws \
--hcatalog-partition-keys yearinfo \
--hcatalog-partition-values $TD_YEAR \
-m 1