赞
踩
用户画像项目有两个核心内容:用户画像标签和用户one_id。
用户画像标签体系,是波士顿咨询公司的团队帮忙搭建的,而我本人则是负责one_id的设计和生成。one_id,是用户唯一标识,用于海量数据中识别出同一个人。目前市面上有两种one_id:无中生有的one_id和基于现有用户数据生成的one_id。后者是前者的子集。
无中生有的one_id,就是基于用户访问数据:
【难点】:无中生有的one_id的技术难点在于如何获取用户设备号,可是现在很多设备商都不让你获取设备号了。
而我们用户画像项目组的one_id,则是基于现有的用户信息,去生成的one_id。
【难点】:这个项目的难点在于,如何将多条用户数据识别成同一个人。很多人马上就想到了,通过身份证号,那没有身份证号的呢,怎么识别成同一个人?没有身份证号,那咱们就给他造一个“身份证号”——phone_id,具体怎么操作,请继续往下看。
one_id维度表表结构
CREATE TABLE IF NOT EXISTS dw.dim_one_id( sk_id bigint COMMENT "自增主键" ,phone string COMMENT "该表唯一手机号" ,one_id bigint COMMENT "用户唯一标识" ,nature string COMMENT "属性:个人/组织" ,one_id_role string COMMENT "one-id角色:线索、潜客、车主" ,user_name_first string COMMENT "同一手机号取表优先级最高的名字" ,user_name string COMMENT "同一证件/手机号名字频次最高的名字" ,sex string COMMENT "性别" ,user_license string COMMENT "证件号" ,user_birth string COMMENT "出生日期" ,user_email string COMMENT "邮件地址" ,user_qq string COMMENT "QQ" ,user_wx string COMMENT "微信" ,user_addr string COMMENT "住址" ,source_list string COMMENT "数据源列表" ,source_pd_list string COMMENT "主键列表" ) COMMENT 'dw层--one_id维度表' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036' STORED AS PARQUET ;
one_id的目的就是,通过one_id去触达更多的客户。通过什么去触达呢?手机号。所以one_id的本质就是“电话簿”。
one_id的基本假设,其实就是从one_id的本质抽象过来的,one_id的基本假设就是“电话簿”的特征。
one_id的基本假设,同时也是one_id的数据清洗规则和融合逻辑。
假如同一个手机号,该用户在“懂车帝”上留的用户名是“妮可罗宾小可爱”,而他在购车(实销订单)的时候留的用户名是“李建钢”。你觉得,他在哪里留的信息可信度比较高?那肯定是购车的时候!
所以基于信息可信度,我会给数据来源表,分优先级1、2、3…数字越小,可信度越高。而我给优先级的逻辑是这样:
注:排了表的优先级,一定要跟业务方同步和调整。
CREATE TABLE IF NOT EXISTS ods.etl_one_id1 ( source STRING COMMENT '数据源' , source_level INT COMMENT '表优先级' , source_pd STRING COMMENT '该条数据在源数据的主键' , user_name STRING COMMENT '姓名' , sex STRING COMMENT '性别' , phone1 STRING COMMENT '手机号1' , phone2 STRING COMMENT '手机号2' , user_license STRING COMMENT '证件号' , user_birth STRING COMMENT '出生日期' , user_email STRING COMMENT '邮件地址' , user_qq STRING COMMENT 'QQ' , user_wx STRING COMMENT '微信' , user_addr STRING COMMENT '住址' , etl_dt STRING COMMENT 'ETL时间' ) PARTITIONED BY (TBL_NAME STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036' STORED AS PARQUET ;
CREATE TABLE ODS.ETL_ONE_ID1A AS SELECT * FROM ( SELECT source , source_level , source_pd , user_name , sex , phone1 , phone2 , user_license , user_birth , user_email , user_qq , user_wx , user_addr ,row_number() over (partition by source,source_pd order by COALESCE(phone1 ,phone2) desc) as rn FROM ODS.ETL_ONE_ID1 where phone1 is not null OR phone2 IS NOT NULL ) t WHERE RN=1 ;
CREATE TABLE IF NOT EXISTS dw.dim_table_level (
source STRING COMMENT '数据源--表名'
, source_level INT COMMENT '表优先级'
)
COMMENT 'dw层--来源表优先级 维度表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
INSERT INTO TABLE dw.dim_table_level VALUES
('某4S店销售订单表',2)
,('某汽车APP车主认证表',1)
,('某4S店访客登记表',3)
,('懂车帝用户表',4)
,('易车网用户表',5)
;
--源表给定【表优先顺序】 CREATE TABLE ODS.ETL_ONE_ID1B AS SELECT A.source , B.source_level , A.source_pd , A.user_name , A.sex , A.phone1 , A.phone2 , A.user_license , A.user_birth , A.user_email , A.user_qq , A.user_wx , A.user_addr FROM ods.ETL_ONE_ID1A A LEFT JOIN dw.dim_table_level B ON A.source=B.source ;在这里插入代码片
CREATE TABLE ODS.ETL_ONE_ID2 AS SELECT source ,source_level ,IF(LENGTH(source_pd)=0,NULL,source_pd) AS source_pd ,IF(LENGTH(user_name)=0,NULL,user_name) AS user_name ,IF(LENGTH(sex)=0,NULL,sex) AS sex ,IF(LENGTH(phone1)=0,NULL,phone1) AS phone1 ,IF(LENGTH(phone2)=0,NULL,phone2) AS phone2 ,IF(LENGTH(user_license)=0,NULL,user_license) AS user_license ,IF(LENGTH(user_birth)=0,NULL,user_birth) AS user_birth ,IF(LENGTH(user_email)=0,NULL,user_email) AS user_email ,IF(LENGTH(user_qq)=0,NULL,user_qq) AS user_qq ,IF(LENGTH(user_wx)=0,NULL,user_wx) AS user_wx ,IF(LENGTH(user_addr)=0,NULL,user_addr) AS user_addr FROM ODS.ETL_ONE_ID1B ;
CREATE TABLE ODS.ETL_ONE_ID3 AS SELECT source ,source_level ,regexp_replace(source_pd,'[\\s]+|[\\u3000]+|[\,]','') AS source_pd ,regexp_replace(user_name,'[\\s]+|[\\u3000]+|[\,]','') AS user_name ,regexp_replace(sex,'[\\s]+|[\\u3001]+|[\,]','') AS sex ,regexp_replace(phone1,"[\s+\!\/_,$%^*(+\"\')]+|[\\s]+|[\\u3002]+|[\,]+|[\\\u4E00-\\\u9FA5]+|[::+——()?【】“”!,。?、~@#¥%……&*().-]+", '') AS phone1 ,regexp_replace(phone2,"[\s+\!\/_,$%^*(+\"\')]+|[\\s]+|[\\u3002]+|[\,]+|[\\\u4E00-\\\u9FA5]+|[::+——()?【】“”!,。?、~@#¥%……&*().-]+", '') AS phone2 ,regexp_replace(user_license,'[\\s]+|[\\u3004]+|[\,]','') AS user_license ,regexp_replace(user_birth,'[\\s]+|[\\u3005]+|[\,]','') AS user_birth ,regexp_replace(user_email,'[\\s]+|[\\u3006]+|[\,]','') AS user_email ,regexp_replace(user_qq,'[\\s]+|[\\u3007]+|[\,]','') AS user_qq ,regexp_replace(user_wx,'[\\s]+|[\\u3008]+|[\,]','') AS user_wx ,regexp_replace(user_addr,'[\\s]+|[\\u3009]+|[\,]','') AS user_addr FROM ODS.ETL_ONE_ID2 ;
CREATE TABLE ODS.ETL_ONE_ID4 AS SELECT source ,source_level ,source_pd ,CASE WHEN user_name IS NOT NULL THEN regexp_extract(user_name,'([\\\u4E00-\\\u9FA5]+)',1) WHEN LENGTH(user_name)=0 THEN NULL ELSE NULL END user_name ,sex ,CASE WHEN phone1 IS NOT NULL AND phone1 regexp "^(010|02\\d|0[3-9]\\d{2})?\\d{6,8}$"=TRUE THEN phone1 --座机号清洗 WHEN phone1 IS NOT NULL AND phone1 regexp "^(010|02\\d|0[3-9]\\d{2})\\-?\\d{6,8}$"=TRUE THEN phone1 --座机号清洗 WHEN phone1 IS NOT NULL AND phone1 regexp "^1[3-9]\\d{9}$"=TRUE THEN phone1 --手机号清洗 WHEN phone1 IS NOT NULL AND phone1 regexp "^0+1[3-9]\\d{9}$"=TRUE THEN substr(phone1,-11) --手机号清洗 ELSE NULL END AS phone1 ,CASE WHEN phone2 IS NOT NULL AND phone2 regexp "^(010|02\\d|0[3-9]\\d{2})?\\d{6,8}$"=TRUE THEN phone2 --座机号清洗 WHEN phone2 IS NOT NULL AND phone2 regexp "^(010|02\\d|0[3-9]\\d{2})\\-?\\d{6,8}$"=TRUE THEN phone2 --座机号清洗 WHEN phone2 IS NOT NULL AND phone2 regexp "^1[3-9]\\d{9}$"=TRUE THEN phone2 --手机号清洗 WHEN phone2 IS NOT NULL AND phone2 regexp "^0+1[3-9]\\d{9}$"=TRUE THEN substr(phone2,-11) --手机号清洗 ELSE NULL END AS phone2 ,user_license ,CASE WHEN user_birth > ADD_MONTHS(CURRENT_DATE,-204) or user_birth <ADD_MONTHS(CURRENT_DATE,-1200) ---年龄大于100岁 或 小于17岁置为null THEN NULL ELSE SUBSTR(user_birth,0,10) END AS user_birth ,user_email ,CASE WHEN user_qq regexp('([0-9]{5,11})')=true then regexp_extract(user_qq,'([0-9]{5,11})',1) ELSE NULL end AS user_qq ,CASE WHEN user_wx regexp('([a-zA-Z]{1}[-_a-zA-Z0-9]{5,19})')=TRUE OR user_wx regexp('(([0][3-9]{1}[0-9]{9})|([1][3456789][0-9]{9}))')=TRUE OR user_wx regexp('([0-9]{5,11})')=true THEN regexp_extract(user_wx,'(([a-zA-Z]{1}[-_a-zA-Z0-9]{5,19})|([0][3-9]{1}[0-9]{9})|([1][3456789][0-9]{9})|([0-9]{5,11}))',1) ELSE NULL END AS user_wx ,user_addr FROM ODS.ETL_ONE_ID3 ;
注:以下代码包含了三个自定义函数,本文不进行具体展开:
CREATE TABLE ODS.ETL_ONE_ID5 AS SELECT source ,source_level ,source_pd ,IF(LENGTH(user_name)=0,NULL,user_name) AS user_name ,CASE WHEN LENGTH(user_license)=18 AND default.idennum(user_license) IS NOT NULL THEN IF(substr(user_license,17,1)%2 = 0,'女','男')--18位身份证提取性别 WHEN LENGTH(user_license)=15 AND default.idennum(user_license) IS NOT NULL THEN IF(substr(user_license,15,1)%2 = 0,'女','男') --15位身份证提取性别 ELSE sex END AS sex ,default.phones(phone1) AS phone1 ,default.phones(phone2) AS phone2 ,CASE when length(user_license)=18 AND user_license=regexp_extract(user_license,'(^[1-9][0-9]{5}[1-9][0-9]{3}((0[1-9])|(1[0-2]))((0[1-9])|([1|2][0-9])|(3[0|1]))[0-9Xx]{4})',1) THEN default.idennum(user_license) --18位身份证校验 when length(user_license)=15 AND default.idennum(user_license) IS NOT NULL THEN user_license --15位身份证校验 when default.checks(user_license) is not null THEN default.checks(user_license) --企业代码验证 else null end as user_license ,CASE when length(user_license)=18 AND user_license=regexp_extract(user_license,'(^[1-9][0-9]{5}[1-9][0-9]{3}((0[1-9])|(1[0-2]))((0[1-9])|([1|2][0-9])|(3[0|1]))[0-9Xx]{4})',1) THEN SUBSTR(CAST(from_unixtime(UNIX_TIMESTAMP(substr(user_license,7, 8),'yyyyMMdd')) AS STRING),0,10)--18位身份证提取生日 when length(user_license)=15 AND default.idennum(user_license) IS NOT NULL THEN SUBSTR(CAST(from_unixtime(UNIX_TIMESTAMP(CONCAT('19',SUBSTR(user_license,7, 6)),'yyyyMMdd')) AS STRING),0,10) --15位身份证提取生日 ELSE user_birth END AS user_birth ,CASE WHEN user_email IS NOT NULL AND user_email=regexp_extract(user_email,'([a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+)',1) THEN user_email ELSE CONCAT(user_qq,'@qq.com') END AS user_email ,user_qq ,user_wx ,user_addr FROM ODS.ETL_ONE_ID4 ;
CREATE TABLE ODS.ETL_ONE_ID6 AS SELECT source ,source_level ,source_pd ,IF(LENGTH(user_name)=0,NULL,user_name) AS user_name --为了避免进行 【同源一机一人】的筛选被筛选掉 ,IF(LENGTH(sex)=0,NULL,sex) AS sex ,IF(LENGTH(phone1)=0,NULL,phone1) AS phone1 ,IF(LENGTH(phone2)=0,NULL,phone2) AS phone2 ,IF(LENGTH(user_license)=0,NULL,user_license) AS user_license ,IF(LENGTH(user_birth)=0,NULL,user_birth) AS user_birth ,IF(LENGTH(user_email)=0,NULL,user_email) AS user_email ,IF(LENGTH(user_qq)=0,NULL,user_qq) AS user_qq ,IF(LENGTH(user_wx)=0,NULL,user_wx) AS user_wx ,IF(LENGTH(user_addr)=0,NULL,user_addr) AS user_addr FROM ODS.ETL_ONE_ID5 ;
CREATE TABLE ODS.ETL_ONE_ID7 AS SELECT A.source ,A.source_level ,A.source_pd ,A.user_name ,A.sex ,A.phone ,A.user_license ,IF(A.user_birth > ADD_MONTHS(CURRENT_DATE,-204) or A.user_birth <ADD_MONTHS(CURRENT_DATE,-1200),NULL,A.user_birth) AS user_birth ---年龄大于100岁 或 小于17岁置为null ,A.user_email ,A.user_qq ,A.user_wx ,A.user_addr FROM ( SELECT source ,source_level ,source_pd ,user_name ,sex ,phone1 as phone ,user_license ,user_birth ,user_email ,user_qq ,user_wx ,user_addr FROM ODS.ETL_ONE_ID6 WHERE phone1 IS NOT NULL GROUP BY source ,source_level ,source_pd ,user_name ,sex ,phone1 ,user_license ,user_birth ,user_email ,user_qq ,user_wx ,user_addr UNION ALL SELECT source ,source_level ,source_pd ,user_name ,sex ,phone2 as phone ,user_license ,user_birth ,user_email ,user_qq ,user_wx ,user_addr FROM ODS.ETL_ONE_ID6 WHERE phone2 IS NOT NULL GROUP BY source ,source_level ,source_pd ,user_name ,sex ,phone2 ,user_license ,user_birth ,user_email ,user_qq ,user_wx ,user_addr ) A GROUP BY A.source ,A.source_level ,A.user_name ,A.source_pd ,A.sex ,A.phone ,A.user_license ,IF(A.user_birth > ADD_MONTHS(CURRENT_DATE,-204) or A.user_birth <ADD_MONTHS(CURRENT_DATE,-1200),NULL,A.user_birth) ---年龄大于100岁 或 小于17岁置为null ,A.user_email ,A.user_qq ,A.user_wx ,A.user_addr ;
CREATE TABLE ODS.tmp_source_phone_unique1 --同一数据源 【同源一机一人】 AS SELECT A.source ,A.source_level ,A.source_pd ,A.user_name ,A.sex ,A.phone ,A.user_license ,A.user_birth ,A.user_email ,A.user_qq ,A.user_wx ,A.user_addr FROM ODS.ETL_ONE_ID7 A JOIN ( SELECT * FROM ( SELECT phone ,source ,COUNT(DISTINCT user_name) AS num_uesr_name --同一个手机号,机主数量,基于用户名字 。风险:不同数源的用户名不同,从而会造成数据 ,COUNT(DISTINCT user_license) AS num_license --同一个手机号,机主数量,基于用户证件 FROM ODS.ETL_ONE_ID7 GROUP BY source ,phone ) B1 WHERE B1.num_uesr_name<=1 OR B1.num_license=1 --确保同一个数据源,【一机一人】,考虑到一个人只有一个称呼或不填 ) B ON A.phone=B.phone AND A.source=B.source ;
CREATE TABLE ODS.tmp_source_phone_unique --同一数据源 【同源一证一人】 AS SELECT source ,source_level ,source_pd ,user_name ,sex ,phone ,IF(num_phone_license>2,NULL,user_license) AS user_license ,user_birth ,user_email ,user_qq ,user_wx ,user_addr FROM ( SELECT source ,source_level ,source_pd ,user_name ,sex ,phone ,user_license ,COUNT(DISTINCT phone) OVER(PARTITION BY source,user_license ) AS num_phone_license --同一个数据源一个身份证号对应的手机号数量 ,user_birth ,user_email ,user_qq ,user_wx ,user_addr FROM ODS.tmp_source_phone_unique1 WHERE default.idennum(user_license) IS NOT NULL ) TT UNION ALL SELECT source ,source_level ,source_pd ,user_name ,sex ,phone ,user_license ,user_birth ,user_email ,user_qq ,user_wx ,user_addr FROM ODS.tmp_source_phone_unique1 WHERE default.idennum(user_license) IS NULL ;
CREATE TABLE IF NOT EXISTS dw.dim_phone_id(
phone_id string COMMENT "手机号标识"
,phone string COMMENT "手机号"
,times string COMMENT "手机号在全数据源中的频次"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
第二点原则呼应了“一个人可以有多个手机号”的基本假设。
第三点原则呼应了“一个手机号只能触达一个人”的基本假设。
CREATE TABLE ods.etl_phone_times --手机号频次表
AS
SELECT
phone
,COUNT( DISTINCT source) AS times
FROM
ods.tmp_source_phone_unique
GROUP BY phone
;
CREATE TABLE ods.etl_phone_id AS SELECT phone_id ,phone ,times FROM ( SELECT FIRST_VALUE(phone) OVER (partition by source,source_pd ORDER BY times DESC ) AS phone_id --同一数据源,同一主键 有两个不同手机号,以频次高的手机号作为一个用户标识 phone_id ,phone ,times FROM ( SELECT A.source ,A.source_pd ,A.phone ,B.times FROM ods.tmp_source_phone_unique A LEFT JOIN ods.etl_phone_times B ON A.phone=B.phone ) t011 WHERE phone regexp "^(010|02\\d|0[3-9]\\d{2})?\\d{6,8}$"=FALSE --座机号频次太高,故只取 非座机号去生成 phone_id UNION ALL SELECT phone AS phone_id --座机号作为主键一个用户标识 ,phone ,times FROM ( SELECT A.source ,A.source_pd ,A.phone ,B.times FROM ods.tmp_source_phone_unique A LEFT JOIN ods.etl_phone_times B ON A.phone=B.phone ) t011 WHERE phone regexp "^(010|02\\d|0[3-9]\\d{2})?\\d{6,8}$"=TRUE --筛选出座机号 ) t01 GROUP BY phone_id ,phone ,times ;
CREATE TABLE ods.etl_phone_id1 AS SELECT phone_id ,phone ,times FROM ( SELECT IF(num_phone_id>1,phone,phone_id) AS phone_id ,phone ,times FROM ( SELECT phone_id ,phone ,times ,COUNT(DISTINCT phone_id) OVER (PARTITION BY phone ) AS num_phone_id FROM ods.etl_phone_id ) t1 ) TT GROUP BY phone_id ,phone ,times ;
CREATE TABLE ods.etl_phone_id2 AS SELECT IF(num_phone>3,phone,phone_id) AS phone_id ,phone ,times FROM ( SELECT phone_id ,COUNT(DISTINCT phone) OVER(PARTITION BY phone_id ) AS num_phone ,phone ,times FROM ods.etl_phone_id1 ) TT ;
INSERT OVERWRITE TABLE dw.dim_phone_id
SELECT
IF(B.phone_id IS NULL,A.phone_id,B.phone_id) AS phone_id
,A.phone
,A.times
FROM
ods.etl_phone_id2 A
LEFT JOIN
dw.dim_phone_id B
ON A.phone=B.phone
;
目标表:ods.phone_fuse
CREATE TABLE IF NOT EXISTS ods.phone_fuse( phone_id string COMMENT "手机号识别成一个人" ,phone string COMMENT "手机号" ,phone_times string COMMENT "手机号在全数据源中的频次" ,user_name_first string COMMENT "基于表优先级取到的用户名" ,sex string COMMENT "性别" ,user_license string COMMENT "证件号" ,user_birth string COMMENT "出生日期" ,user_email string COMMENT "邮件地址" ,user_qq string COMMENT "QQ" ,user_wx string COMMENT "微信" ,user_addr string COMMENT "住址" ,source_list string COMMENT "数据源列表" ,source_pd_list string COMMENT "主键列表" ) COMMENT 'ods层--基于手机号基本信息融合表' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036' STORED AS PARQUET ;
假如同一个手机号,该用户在“懂车帝”上留的用户名是“妮可罗宾小可爱”,而他在购车(实销订单)的时候留的用户名是“李建钢”。你觉得,他在哪里留的信息可信度比较高?那肯定是购车的时候!
所以基于信息可信度,我会给数据来源表,分优先级1、2、3…数字越小,可信度越高。
有了这些条件,你怎么去补齐每个手机号的基本信息呢?我是这样操作的:
1、以dw.dim_phone_id作为主表,该表的phone是不重复的,作为ods.phone_fuse的主键。
SELECT
phone_id
,phone
,times AS phone_times
FROM
cdp_dw.dim_phone_id
2、其他字段如用户名、性别、证件号、生日等字段,我是这样取的:
以“用户名”取值为例:
基于【同源一机一人、同源一证一人】表ods.tmp_source_phone_unique,筛选出“用户名”不为null的数据
基于步骤1,用以下代码取出优先级最高的“用户名”
1. ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
2. where RN=1
SELECT
*
FROM
(
SELECT
phone
,user_name AS user_name_first
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN
,source
,source_pd
FROM
ods.tmp_source_phone_unique
WHERE user_name IS NOT NULL AND phone IS NOT NULL
) A21
WHERE A21.RN=1
3、融合完整代码:
INSERT OVERWRITE TABLE ods.phone_fuse SELECT A1.phone_id ,A1.phone ,A1.phone_times ,A2.user_name_first ,nvl(B.sex,'未知') AS sex ,C.user_license ,D.user_birth ,E.user_email ,F.user_qq ,G.user_wx ,H.user_addr ,CONCAT_WS(',',A2.source ,B.source , C.source , D.source , E.source , F.source , G.source , H.source ) AS source_list ,CONCAT_WS(',',A2.source_pd ,B.source_pd , C.source_pd , D.source_pd , E.source_pd , F.source_pd , G.source_pd , H.source_pd ) AS source_pd_list FROM ( SELECT phone_id ,phone ,times AS phone_times FROM cdp_dw.dim_phone_id ) A1 --手机号主表 【全源一机频次】表 LEFT JOIN ( SELECT * FROM ( SELECT phone ,user_name AS user_name_first ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN ,source ,source_pd FROM ods.tmp_source_phone_unique WHERE user_name IS NOT NULL AND phone IS NOT NULL ) A21 WHERE A21.RN=1 ) A2 --姓名 ON A1.phone=A2.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,sex ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN ,source ,source_pd FROM ods.tmp_source_phone_unique WHERE sex IS NOT NULL AND sex<>'未知' AND phone IS NOT NULL ) B1 WHERE B1.RN=1 ) B --性别 ON A1.phone=B.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,user_license ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN ,source ,source_pd FROM ods.tmp_source_phone_unique WHERE user_license IS NOT NULL AND phone IS NOT NULL ) C1 WHERE C1.RN=1 ) C --证件号 ON A1.phone=C.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,user_birth ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN ,source ,source_pd FROM ods.tmp_source_phone_unique WHERE user_birth IS NOT NULL AND phone IS NOT NULL ) D1 WHERE D1.RN=1 ) D --出生日期 ON A1.phone=D.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,user_email ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN ,source ,source_pd FROM ods.tmp_source_phone_unique WHERE user_email IS NOT NULL AND phone IS NOT NULL ) E1 WHERE E1.RN=1 ) E --邮件地址 ON A1.phone=E.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,user_qq ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN ,source ,source_pd FROM ods.tmp_source_phone_unique WHERE user_qq IS NOT NULL AND phone IS NOT NULL ) F1 WHERE F1.RN=1 ) F --qq ON A1.phone=F.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,user_wx ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN ,source ,source_pd FROM ods.tmp_source_phone_unique WHERE user_wx IS NOT NULL ) G1 WHERE G1.RN=1 ) G --微信 ON A1.phone=G.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,user_addr ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY source_level ) AS RN ,source ,source_pd FROM ods.tmp_source_phone_unique WHERE user_addr IS NOT NULL AND phone IS NOT NULL ) H1 WHERE H1.RN=1 ) H --地址 ON A1.phone=H.phone ;
CREATE TABLE IF NOT EXISTS ods.user_license_fuse( phone_id string COMMENT "手机号识别成一个人" ,phone string COMMENT "手机号" ,phone_times INT COMMENT "手机号在全数据源中的频次" ,user_name_first string COMMENT "同一手机号去表优先级最高的名字" ,user_name string COMMENT "同一证件/手机号名字频次最高的名字" ,sex string COMMENT "性别" ,user_license string COMMENT "证件号" ,user_birth string COMMENT "出生日期" ,user_email string COMMENT "邮件地址" ,user_qq string COMMENT "QQ" ,user_wx string COMMENT "微信" ,user_addr string COMMENT "住址" ,source_list string COMMENT "数据源列表" ,source_pd_list string COMMENT "主键列表" ) COMMENT 'ods层--基于证件号基本信息融合表' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036' STORED AS PARQUET ;
CREATE TABLE ods.user_license_name1
AS
SELECT
user_license
,user_name
,count(user_name) AS user_name_times1
FROM
ods.tmp_source_phone_unique
WHERE user_license IS NOT NULL AND default.idennum(user_license) IS NOT NULL
GROUP BY user_license
,user_name
;
2.基于phone_id去统计user_name的频次 ,同一phone_id取名字频次高的user_name
CREATE TABLE ods.user_name_times2 AS SELECT A.phone_id ,B.user_name ,SUM(user_name_times2 ) AS user_name_times2 FROM cdp_dw.dim_phone_id A LEFT JOIN ( SELECT phone ,user_name ,count(1) AS user_name_times2 FROM ods.tmp_source_phone_unique WHERE user_name IS NOT NULL AND default.idennum(user_license) IS NULL GROUP BY phone ,user_name ) B ON A.PHONE=B.PHONE GROUP BY A.phone_id ,B.user_name ;
CREATE TABLE ods.phone_id_sex AS SELECT A.phone_id ,B.sex ,SUM(num_sex) AS num_sex FROM cdp_dw.dim_phone_id A LEFT JOIN ( select phone ,sex ,count(1) AS num_sex from ods.tmp_source_phone_unique where sex is not null AND sex<>'未知' group by phone ,sex ) B ON A.PHONE=B.PHONE GROUP BY A.phone_id ,B.sex ;
INSERT OVERWRITE TABLE ods.user_license_fuse SELECT A.phone_id ,A.phone ,A.phone_times ,A.user_name_first ,A.user_name --同一证件号频次高的名字 ,CASE WHEN LENGTH(A.user_license)=18 THEN IF(substr(A.user_license,17,1)%2 = 0,'女','男')--18位身份证提取性别 WHEN LENGTH(A.user_license)=15 THEN IF(substr(A.user_license,15,1)%2 = 0,'女','男') --15位身份证提取性别 ELSE A.sex END AS sex ,A.user_license ,CASE when length(A.user_license)=18 THEN SUBSTR(CAST(from_unixtime(UNIX_TIMESTAMP(substr(A.user_license,7, 8),'yyyyMMdd')) AS STRING),0,10)--18位身份证提取生日 when length(A.user_license)=15 THEN SUBSTR(CAST(from_unixtime(UNIX_TIMESTAMP(CONCAT('19',SUBSTR(A.user_license,7, 6)),'yyyyMMdd')) AS STRING),0,10) --15位身份证提取生日 ELSE A.user_birth END AS user_birth ,IF(A.user_email IS NULL ,FIRST_VALUE(A.user_email) OVER (PARTITION BY A.user_license ORDER BY LENGTH(NVL(A.user_email ,'')) DESC) ,A.user_email )AS user_email ,IF(A.user_qq IS NULL ,FIRST_VALUE(A.user_qq ) OVER (PARTITION BY A.user_license ORDER BY LENGTH(NVL(A.user_qq ,'')) DESC) ,A.user_qq )AS user_qq ,IF(A.user_wx IS NULL ,FIRST_VALUE(A.user_wx) OVER (PARTITION BY A.user_license ORDER BY LENGTH(NVL(A.user_wx,'')) DESC) ,A.user_wx )AS user_wx ,IF(A.user_addr IS NULL ,FIRST_VALUE(A.user_addr) OVER (PARTITION BY A.user_license ORDER BY LENGTH(NVL(A.user_addr,'')) DESC) ,A.user_addr ) AS user_addr ,A.source_list ,A.source_pd_list FROM ( SELECT A.phone_id ,A.phone ,A.phone_times ,A.user_name_first ,B.user_name ,A.sex ,A.user_license ,A.user_birth ,A.user_email ,A.user_qq ,A.user_wx ,A.user_addr ,A.source_list ,A.source_pd_list FROM ( SELECT * FROM ods.phone_fuse WHERE default.idennum(user_license) IS NOT NULL ) A LEFT JOIN --同一证件号取频次最高的名字 ( SELECT * FROM ( SELECT user_license ,user_name ,row_number() over (partition by user_license order by user_name_times1 desc ) AS RN FROM ods.user_license_name1 ) B1 WHERE RN=1 ) B ON A.user_license=B.user_license ) A --有身份证,同一身份证号,user_name 取频次最高的频次、其他字段取不为空的 UNION ALL --【一人多机】信息融合:一个人的多个手机号信息进行融合 SELECT B.phone_id ,B.phone ,B.phone_times ,B.user_name_first ,B.user_name --同一手机号取频次高的名字 ,B.sex ,B.user_license ,IF(B.user_birth IS NULL ,FIRST_VALUE(B.user_birth) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_birth,'')) DESC) ,B.user_birth ) AS user_birth ,IF (B.user_email IS NULL ,FIRST_VALUE(B.user_email) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_email,'')) DESC) ,B.user_email ) AS user_email ,IF(B.user_qq IS NULL ,FIRST_VALUE(B.user_qq ) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_qq ,'')) DESC) ,B.user_qq ) AS user_qq ,IF (B.user_wx IS NULL ,FIRST_VALUE(B.user_wx ) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_wx ,'')) DESC) ,B.user_wx ) AS user_wx ,IF (B.user_addr IS NULL ,FIRST_VALUE(B.user_addr) OVER (PARTITION BY B.phone_id ORDER BY LENGTH(NVL(B.user_addr ,'')) DESC) ,B.user_addr ) AS user_addr ,B.source_list ,B.source_pd_list FROM ( SELECT A.phone_id ,A.phone ,A.phone_times ,A.user_name_first ,B.user_name ,NVL(C.sex,'未知') AS sex ,A.user_license ,A.user_birth ,A.user_email ,A.user_qq ,A.user_wx ,A.user_addr ,A.source_list ,A.source_pd_list FROM ( SELECT * FROM ods.phone_fuse WHERE default.idennum(user_license) IS NULL ) A --这个表已经确保表手机号是唯一的了 LEFT JOIN --同一phone_id取 频次最高的名字 ( SELECT * FROM ( SELECT phone_id ,user_name ,row_number() over (partition by phone_id order by user_name_times2 desc ) AS RN FROM ods.user_name_times2 ) B1 WHERE RN=1 ) B ON A.phone_id=B.phone_id LEFT JOIN --同一phone_id 取频次高的性别 ( SELECT * FROM ( SELECT phone_id ,sex ,row_number() over (partition by phone_id order by num_sex desc ) AS RN FROM ods.phone_id_sex ) B1 WHERE RN=1 ) C ON A.phone_id=C.phone_id ) B --无身份证号,同一phone_id频次高的名字和性别 ;
CREATE TABLE IF NOT EXISTS dw.dim_one_id_initialize(
one_id string COMMENT "ONE_ID"
,user_license string COMMENT "证件号/phone_id"
,phone string COMMENT "手机号/座机号"
,phone_times int COMMENT "手机号频次"
)
COMMENT 'dw层--one_id初始化表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
生成ond_id :最初one_id是基于手机号、座机号、身份证,进行hash()计算的
警惕:hash会碰撞(不同的取数,却计算出来同样的hash值)
碰撞概率:
为了获得唯一one_id,我们采用了简单的“凯撒加密”,“凯撒加密”java代码如下:
package cn.ysw.com.aspect; import cn.hutool.core.util.StrUtil; import java.util.HashMap; import java.util.Map; public class Test { private static Map<String,String> MAP = new HashMap<>(); static { MAP.put("0","3"); MAP.put("1","4"); MAP.put("2","5"); MAP.put("3","6"); MAP.put("4","7"); MAP.put("5","8"); MAP.put("6","9"); MAP.put("7","0"); MAP.put("8","1"); MAP.put("9","2"); MAP.put("x","y"); MAP.put("X","Z"); } public static String getId(String str) { if (StrUtil.isBlank(str)) { return ""; } char[] chars = str.toCharArray(); StringBuffer sb = new StringBuffer(); for (char a: chars) { String s = String.valueOf(a); String val = MAP.get(s); if (StrUtil.isBlank(val)) { sb.append(a); } else { sb.append(val); } } return sb.toString(); } public static void main(String[] args) { String id = getId(); System.out.println(id); } }
INSERT OVERWRITE TABLE dw.dim_one_id_initialize SELECT md5(A.user_license) AS ONE_ID ,A.user_license ,A.phone ,A.phone_times FROM ( SELECT user_license ,phone ,phone_times FROM ods.user_license_fuse WHERE default.idennum(user_license) IS NOT NULL --有身份证号的数据 UNION ALL SELECT phone_id AS user_license ,phone ,phone_times FROM ods.user_license_fuse WHERE default.idennum(user_license) IS NULL --无身份证号,有企业信用代码 的数据 ) A ;
–ONE-ID 角色表
–1:线索
–2:潜客
–3:车主
CREATE TABLE IF NOT EXISTS dw.dim_one_id_role(
phone string COMMENT "手机号"
,role_id bigint COMMENT "角色编码"
,role_name string COMMENT "角色名称"
)
COMMENT 'dw层--one_id角色表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036'
STORED AS PARQUET
;
这个角色判断,是按手机号来源表去判断的:
插入数据:one_id角色表
INSERT OVERWRITE TABLE dw.dim_one_id_role SELECT phone ,role_id ,role_name FROM ( SELECT phone ,CASE WHEN SOURCE IN ('易车网用户表','懂车帝用户表') THEN 1 WHEN SOURCE IN ('某4S店访客登记表') THEN 2 WHEN SOURCE IN ('某4S店销售订单表','某汽车APP车主认证表') THEN 3 ELSE NULL END AS role_id ,CASE WHEN SOURCE IN ('易车网用户表','懂车帝用户表') THEN '线索' WHEN SOURCE IN ('某4S店访客登记表') THEN '潜客' WHEN SOURCE IN ('某4S店销售订单表','某汽车APP车主认证表') THEN '车主' ELSE NULL END AS role_name FROM ods.tmp_source_phone_unique ) TT GROUP BY phone ,role_id ,role_name ;
SELECT
*
FROM
(
SELECT
phone
,role_id
,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY role_id desc) AS RN
,role_name AS one_id_role
FROM
dw.dim_one_id_role
) C1
WHERE C1.RN=1
CREATE TABLE IF NOT EXISTS dw.dim_one_id_all( sk_id bigint COMMENT "自增主键" ,phone string COMMENT "该表唯一手机号" ,one_id string COMMENT "用户唯一标识" ,nature string COMMENT "属性:个人/组织" ,one_id_role string COMMENT "one-id角色:线索、潜客、车主" ,user_name_first string COMMENT "同一手机号取表优先级最高的名字" ,user_name string COMMENT "同一证件/手机号名字频次最高的名字" ,sex string COMMENT "性别" ,user_license string COMMENT "证件号" ,user_birth string COMMENT "出生日期" ,user_email string COMMENT "邮件地址" ,user_qq string COMMENT "QQ" ,user_wx string COMMENT "微信" ,user_addr string COMMENT "住址" ,source_list string COMMENT "数据源列表" ,source_pd_list string COMMENT "主键列表" ,valid_start_dt string COMMENT "当前行生效日期" ,valid_end_dt string COMMENT "当前行失效日期" ) COMMENT 'dw层--one_id拉链表' PARTITIONED BY (PART_DAY STRING COMMENT '分区字段,当天系统时间') ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036' STORED AS PARQUET ;
SET hive.exec.dynamic.partition=true; SET hive.exec.dynamic.partition.mode=nonstrict; SET hive.strict.checks.type.safety=false; INSERT OVERWRITE TABLE dw.dim_one_id_all PARTITION(PART_DAY) SELECT ROW_NUMBER() OVER (ORDER BY CONCAT_WS('_',t1.phone,cast(t1.one_id as string))) + max_sk AS sk_id ,t1.phone ,t1.one_id ,t1.nature ,t1.one_id_role ,t1.user_name_first ,t1.user_name ,t1.sex ,t1.user_license ,t1.user_birth ,t1.user_email ,t1.user_qq ,t1.user_wx ,t1.user_addr ,t1.source_list ,t1.source_pd_list , DATE_FORMAT(CURRENT_DATE,'yyyyMMdd') AS valid_start_dt , '99991231' AS valid_end_dt ,CAST(CURRENT_DATE AS STRING) AS PART_DAY FROM ( SELECT B.ONE_ID ,A.phone ,CASE WHEN default.checks(A.user_license) IS NOT NULL AND A.user_birth IS NULL THEN '组织' WHEN SUBSTR(A.user_name_first,-1) IN ('司','处','局','馆') THEN '组织' WHEN SUBSTR(user_name,-1) IN ('司','处','局','馆') THEN '组织' ELSE '个人' END AS nature --one-id属性: 证件号经过 企业代码验证 不为null,且生日 为null(因为生日优先用证件号去判断),则判断 为"组织" ,C.one_id_role ,A.user_name_first ,A.user_name ,A.sex ,A.user_license ,A.user_birth ,A.user_email ,A.user_qq ,A.user_wx ,A.user_addr ,A.source_list ,A.source_pd_list FROM ods.user_license_fuse A LEFT JOIN dw.dim_one_id_initialize B --ONE_ID初始化表 ON A.phone = B.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,role_id ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY role_id desc) AS RN ,role_name AS one_id_role FROM dw.dim_one_id_role ) C1 WHERE C1.RN=1 ) C --角色表 ON A.phone=C.phone ) t1 --源数据 LEFT JOIN ( SELECT * FROM dw.dim_one_id_all WHERE valid_start_dt <= DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd') --只取有效的数据 AND valid_end_dt > DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd') AND PART_DAY=CAST(DATE_SUB(CURRENT_DATE,1) AS STRING) --取昨天的分区 )t2 ON t1.phone=t2.phone --获取维度表上次最大的外键,以便为新数据造新的外键 CROSS JOIN ( SELECT COALESCE(MAX(sk_id ),0) AS max_sk FROM dw.dim_one_id_all )dim_sk_tmp WHERE t2.sk_id is null OR t1.one_id !=t2.one_id OR t1.nature !=t2.nature OR t1.one_id_role != t2.one_id_role OR t1.user_name_first!=t2.user_name_first OR t1.user_name !=t2.user_name OR t1.sex !=t2.sex OR t1.user_license !=t2.user_license OR t1.user_birth !=t2.user_birth OR t1.user_email !=t2.user_email OR t1.user_qq !=t2.user_qq OR t1.user_wx !=t2.user_wx UNION ALL SELECT t3.sk_id ,t3.phone ,t3.one_id ,t3.nature ,t3.one_id_role ,t3.user_name_first ,t3.user_name ,t3.sex ,t3.user_license ,t3.user_birth ,t3.user_email ,t3.user_qq ,t3.user_wx ,t3.user_addr ,t3.source_list ,t3.source_pd_list ,t3.valid_start_dt ,CASE WHEN t3.one_id !=t4.one_id OR t3.nature !=t4.nature OR t3.one_id_role !=t4.one_id_role OR t3.user_name_first!=t4.user_name_first OR t3.user_name !=t4.user_name OR t3.sex !=t4.sex OR t3.user_license !=t4.user_license OR t3.user_birth !=t4.user_birth OR t3.user_email !=t4.user_email OR t3.user_qq !=t4.user_qq OR t3.user_wx !=t4.user_wx THEN DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd') ELSE t3.valid_end_dt END AS valid_end_dt ,CAST(CURRENT_DATE AS STRING) AS PART_DAY FROM ( SELECT * FROM dw.dim_one_id_all WHERE valid_start_dt <= DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd') --只取有效的数据 AND valid_end_dt > DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd') AND PART_DAY=CAST(DATE_SUB(CURRENT_DATE,1) AS STRING) --取昨天的分区 )t3 --维度表 LEFT JOIN ( SELECT B.ONE_ID ,A.phone ,CASE WHEN default.checks(A.user_license) IS NOT NULL AND A.user_birth IS NULL THEN '组织' WHEN SUBSTR(A.user_name_first,-1) IN ('司','处','局','馆') THEN '组织' WHEN SUBSTR(user_name,-1) IN ('司','处','局','馆') THEN '组织' ELSE '个人' END AS nature --one-id属性: 证件号经过 企业代码验证 不为null,且生日 为null(因为生日优先用证件号去判断),则判断 为"组织" ,C.one_id_role ,A.user_name_first ,A.user_name ,A.sex ,A.user_license ,A.user_birth ,A.user_email ,A.user_qq ,A.user_wx ,A.user_addr ,A.source_list ,A.source_pd_list FROM ods.user_license_fuse A LEFT JOIN dw.dim_one_id_initialize B --ONE_ID初始化表 ON A.phone = B.phone LEFT JOIN ( SELECT * FROM ( SELECT phone ,role_id ,ROW_NUMBER() OVER(PARTITION BY phone ORDER BY role_id desc) AS RN ,role_name AS one_id_role FROM dw.dim_one_id_role ) C1 WHERE C1.RN=1 --同个手机号有多个角色,取序列号最大的角色 ) C --角色表 ON A.phone=C.phone ) t4 --源数据 ON t3.phone=t4.phone UNION ALL -- SELECT t4.sk_id ,t4.phone ,t4.one_id ,t4.nature ,t4.one_id_role ,t4.user_name_first ,t4.user_name ,t4.sex ,t4.user_license ,t4.user_birth ,t4.user_email ,t4.user_qq ,t4.user_wx ,t4.user_addr ,t4.source_list ,t4.source_pd_list ,t4.valid_start_dt ,t4.valid_end_dt ,CAST(CURRENT_DATE AS STRING) AS PART_DAY FROM dw.dim_one_id_all t4 WHERE valid_end_dt <= DATE_FORMAT(DATE_SUB(CURRENT_DATE,1),'yyyyMMdd') AND PART_DAY=CAST(DATE_SUB(CURRENT_DATE,1) AS STRING) --取昨天的分区 ;
CREATE TABLE IF NOT EXISTS dw.dim_one_id( sk_id bigint COMMENT "自增主键" ,phone string COMMENT "该表唯一手机号" ,one_id string COMMENT "用户唯一标识" ,nature string COMMENT "属性:个人/组织" ,one_id_role string COMMENT "one-id角色:线索、潜客、车主" ,user_name_first string COMMENT "同一手机号取表优先级最高的名字" ,user_name string COMMENT "同一证件/手机号名字频次最高的名字" ,sex string COMMENT "性别" ,user_license string COMMENT "证件号" ,user_birth string COMMENT "出生日期" ,user_email string COMMENT "邮件地址" ,user_qq string COMMENT "QQ" ,user_wx string COMMENT "微信" ,user_addr string COMMENT "住址" ,source_list string COMMENT "数据源列表" ,source_pd_list string COMMENT "主键列表" ) COMMENT 'dw层--one_id维度表' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\036' STORED AS PARQUET ;
INSERT OVERWRITE TABLE dw.dim_one_id SELECT sk_id ,phone ,one_id ,nature ,one_id_role ,user_name_first ,user_name ,sex ,user_license ,user_birth ,user_email ,user_qq ,user_wx ,user_addr ,source_list ,source_pd_list FROM dw.dim_one_id_all WHERE valid_start_dt <= DATE_FORMAT(CURRENT_DATE,'yyyyMMdd') --只取有效的数据 AND valid_end_dt > DATE_FORMAT(CURRENT_DATE,'yyyyMMdd') AND PART_DAY=CAST(CURRENT_DATE AS STRING) ;
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。