Hands-on Example: Preparing the Database and Tables
Hive CREATE TABLE Syntax
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
(
    col1Name col1Type [COMMENT col_comment],
    col2Name col2Type [COMMENT col_comment],
    col3Name col3Type [COMMENT col_comment],
    col4Name col4Type [COMMENT col_comment],
    col5Name col5Type [COMMENT col_comment],
    ...
    colNName colNType [COMMENT col_comment]
)
[PARTITIONED BY (col_name data_type, ...)]
[CLUSTERED BY (col_name, ...) [SORTED BY (col_name, ...)] INTO N BUCKETS]
[ROW FORMAT row_format]
-- e.g. ROW FORMAT DELIMITED FIELDS TERMINATED BY 'char' LINES TERMINATED BY 'char'
[STORED AS file_format]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)]
EXTERNAL: external table
PARTITIONED BY: partitioned table structure
CLUSTERED BY: bucketed table structure
ROW FORMAT: specifies the field and line delimiters
STORED AS: specifies the file storage format
LOCATION: specifies the table's path on HDFS
Default: /user/hive/warehouse/dbdir/tbdir
TBLPROPERTIES: specifies additional table-level configuration properties
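Putting these clauses together, here is a minimal sketch of a partitioned, tab-delimited text table; the database, table, and column names are hypothetical:
create external table if not exists demo_db.user_login (
    user_id  string comment 'user id',
    login_ip string comment 'client ip'
)
partitioned by (dt string)
row format delimited fields terminated by '\t'
stored as textfile
location '/user/hive/warehouse/demo_db.db/user_login';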
Avro CREATE TABLE Syntax
Specifying the file format
Method 1: specify the format directly
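In Hive 0.14 and later, the Avro format can be declared with a single clause:
STORED AS AVRO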
Method 2: specify the SerDe and input/output format classes
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
Specifying the schema
Method 1: define the schema inline
CREATE TABLE embedded
COMMENT "this is the table comment"
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES (
'avro.schema.literal' = '{
  "namespace": "com.howdy",
  "name": "some_schema",
  "type": "record",
  "fields": [ {"name": "string1", "type": "string"} ]
}'
);
Method 2: load a schema file
CREATE TABLE embedded
COMMENT "this is the table comment"
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES (
'avro.schema.url' = 'file:///path/to/the/schema/embedded.avsc'
);
Combined CREATE TABLE statements
Method 1: specify the format and load a schema file
create external table one_make_ods_test.ciss_base_areas
comment 'administrative geographic area table'
PARTITIONED BY (dt string)
stored as avro
location '/test/full_imp/ciss4.ciss_base_areas'
TBLPROPERTIES ('avro.schema.url'='/test/avsc/CISS4_CISS_BASE_AREAS.avsc');
Method 2: specify the SerDe classes and load a schema file
create external table one_make_ods_test.ciss_base_areas
comment 'administrative geographic area table'
PARTITIONED BY (dt string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
location '/test/full_imp/ciss4.ciss_base_areas'
TBLPROPERTIES ('avro.schema.url'='/test/avsc/CISS4_CISS_BASE_AREAS.avsc');
-- CREATE TABLE template:
create external table db_name.table_name
comment 'table comment'
partitioned by (dt string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
location 'HDFS path of this table'
TBLPROPERTIES ('avro.schema.url'='HDFS path of the schema file for this table');
-- Declare a partition:
alter table table_name add if not exists partition (key=value);
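For example, to register one day's partition on the ODS table created below (the partition value is illustrative):
alter table one_make_ods.ciss_base_areas add if not exists partition (dt='20210101');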
Load the Avro schema files generated by Sqoop to create the tables automatically
Configure a pip mirror, then install the Python dependencies:
mkdir ~/.pip
vim ~/.pip/pip.conf
(on Windows, the file is C:\Users\tang\pip\pip.ini)
[global]
index-url = http://mirrors.aliyun.com/pypi/simple/
[install]
trusted-host = mirrors.aliyun.com
pip3 install sasl
pip3 install thrift
pip3 install thrift-sasl
pip3 install cx-Oracle
pip3 install pyhive
conda install sasl thrift thrift-sasl cx-Oracle pyhive
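With the dependencies in place, the scripts below reach Hive through pyhive. A minimal connectivity sketch, assuming a HiveServer2 on node1:10000 with no authentication (take the real address and account from resource/config.txt):
from pyhive import hive

# Assumed HiveServer2 address and account; read the real values from config.txt
conn = hive.Connection(host='node1', port=10000, username='hive', auth='NONE')
cursor = conn.cursor()
cursor.execute('show databases')
print(cursor.fetchall())
cursor.close()
conn.close()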
- `datatohive`
  - CHiveTableFromOracleTable.py: creates the Hive database and builds the Hive tables from Oracle table metadata (a simplified sketch of the DDL-generation step follows this list)
  - CreateMetaCommon.py: defines the fixed strings used at table-creation time, such as database names, layer names, and file-format properties
  - CreateHiveTablePartition.py: manually declares the partition metadata of the ODS-layer tables
  - LoadData2DWD.py: inserts the ODS-layer data into the corresponding DWD-layer tables
- `fileformat`
  - AvroTableProperties.py: Avro file-format object; encapsulates the Avro table-creation string
  - OrcTableProperties.py: ORC file-format object; encapsulates the ORC table-creation string
  - OrcSnappyTableProperties.py: ORC file-format object with Snappy compression
  - TableProperties.py: class for retrieving a table's properties
- `entity`
  - TableMeta.py: Oracle table metadata object; wraps the table name, column information, and table comment
  - ColumnMeta.py: Oracle column metadata object; wraps the column name, type, and comment
- `utils`
  - OracleHiveUtil.py: obtains the Oracle and Hive connections
  - FileUtil.py: reads and writes files; fetches the names of all the Oracle tables
  - TableNameUtil.py: puts the full-load and incremental table names into separate lists
  - ConfigLoader.py: loads the configuration file and reads its settings
  - OracleMetaUtil.py: fetches table information from Oracle: table name, column names, types, comments, and so on
- `EntranceApp.py`: program entry point; the core driver of the run
- `resource`
  - config.txt: configuration file holding the addresses, ports, user names, and passwords for Oracle, Hive, and SparkSQL
- `config`
  - common.py: class for obtaining the logger
  - settings.py: class configuring how logs are recorded
- `log`
  - itcast.log: the log file
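As a rough illustration of the automation idea (not the project's actual code), the sketch below reads one Sqoop-generated .avsc file and emits the matching ODS-layer CREATE TABLE statement; the function name and the usage values are hypothetical, mirroring the paths used earlier in this section:
import json

def ods_ddl_from_avsc(db, table, avsc_hdfs_path, data_hdfs_path, local_avsc):
    # Parse the schema file only to fail fast if it is broken; Hive itself
    # reads the column definitions from avro.schema.url at query time.
    with open(local_avsc, encoding='utf-8') as f:
        json.load(f)
    return (
        f"create external table if not exists {db}.{table}\n"
        "partitioned by (dt string)\n"
        "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'\n"
        "STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'\n"
        "OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'\n"
        f"location '{data_hdfs_path}'\n"
        f"TBLPROPERTIES ('avro.schema.url'='{avsc_hdfs_path}')"
    )

# Hypothetical usage, mirroring the paths from this section
print(ods_ddl_from_avsc(
    'one_make_ods', 'ciss_base_areas',
    'hdfs:///test/avsc/CISS4_CISS_BASE_AREAS.avsc',
    '/test/full_imp/ciss4.ciss_base_areas',
    'CISS4_CISS_BASE_AREAS.avsc',
))
The generated string can then be executed through the pyhive cursor shown above.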
Modify the configuration
tableList = FileUtil.readFileContent("D:\\git-python\\大数据-Spark\\py\\one_make\\resources\\tablenames.txt")
config.read('D:\\git-python\\大数据-Spark\\py\\one_make\\resources\\config.txt')
buffer.append(" location '/test/" + CreateMetaCommon.getDynamicDir(dbName,dynamicDir) + "/" + userName + tableName + "'")
buffer.append("') location '/test/" + CreateMetaCommon.getDynamicDir(dbName,dynamicDir) + "/ciss4.")
Run
D:/miniforge3/python.exe d:/git-python/大数据-Spark/py/one_make/EntranceApp.py
-- Example: creating the ODS-layer and DWD-layer tables
-- ODS-layer table: the schema comes from the avsc file
create external table if not exists one_make_ods.ciss_base_areas
partitioned by (dt string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
location '/test/full_imp/ciss4.ciss_base_areas'
tblproperties ('avro.schema.url'='hdfs:///test/avsc/CISS4_CISS_BASE_AREAS.avsc');
-- DWD-layer table: declare each column's schema by hand
create external table if not exists one_make_dwd.ciss_base_areas(
ID string,
AREANAME string,
PARENTID string,
SHORTNAME string,
LNG string,
LAT string,
RANK bigint,
POSITION string,
SORT bigint
) partitioned by (dt string)
stored as orc
location '/test/dwd/ciss_base_areas';
-- Extract the data of each ODS-layer table into the corresponding DWD-layer table
insert overwrite table dwd.tbname partition(dt = '20210101')
select
col1,
col2,
col3,
...
from ods.tbname
where dt = '20210101';
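Concretely, for the area table above (a sketch; it assumes the select list matches the DWD column order, and backquotes guard the column names that collide with HiveQL keywords):
insert overwrite table one_make_dwd.ciss_base_areas partition (dt = '20210101')
select
    id, areaname, parentid, shortname, lng, lat, `rank`, `position`, sort
from one_make_ods.ciss_base_areas
where dt = '20210101';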