use db_name;

drop table if exists table_name;
create table table_name (
    a string,
    b string,
    c int
)
partitioned by (cal_dt string)
row format delimited fields terminated by '\t';
select *
from (
    select *,
           regexp_replace(get_json_object(base_info, '$.a'), '\\[|"|\\]', '') as a,
           regexp_replace(get_json_object(base_info, '$.b'), '\\[|"|\\]', '') as b
    from table_name
) t
lateral view explode(split(t.a, ',')) ex as a_item;
Parsing a JSON string: get_json_object(base_info, '$.a') extracts the value at path $.a. For an array field it returns the literal string, e.g. ["x","y"], which is why the query above strips the [, ", and ] characters with regexp_replace.
Turning multiple values in one row into multiple rows (multi-value to row values): lateral view explode(split(a, ',')) ex as a_item; see the sketch below.
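A minimal sketch of split plus explode, assuming a hypothetical table demo_t whose string column a holds comma-separated values such as '1,2,3':

select t.c, ex.a_item
from demo_t t
lateral view explode(split(t.a, ',')) ex as a_item;

-- one row (c=5, a='1,2,3') becomes three rows: (5,'1'), (5,'2'), (5,'3')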
Parsing JSON strings with the brickhouse-*.*.*.jar UDF:

create temporary function from_json as 'brickhouse.udf.json.FromJsonUDF';

from_json(items, 'array<map<string,int>>') parses data of the form [{},{},...,{}]; each parsed element is accessed exactly like a map, i.e. map[keyname].
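A small usage sketch, assuming a hypothetical column items holding '[{"cnt":1},{"cnt":2}]':

select from_json(items, 'array<map<string,int>>')[0]['cnt'] as first_cnt  -- 1
from demo_t;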
Collection aggregation: collect_set gathers values into an array with deduplication; if duplicates must be kept, use the collect function from brickhouse-0.7.1.jar instead:

ADD JAR ****/brickhouse-0.7.1.jar;
create temporary function collect as 'brickhouse.udf.collect.CollectUDAF';
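A usage sketch contrasting the two, assuming hypothetical columns user_id and item:

select user_id,
       collect_set(item) as distinct_items,  -- deduplicated
       collect(item)     as all_items        -- duplicates kept (brickhouse)
from demo_t
group by user_id;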
Splitting the parsed array into rows:
from tableName lateral view explode(from_json(para, 'array<map<string,string>>')) x as item where <condition>
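A complete query built on this fragment; the map keys id and name are hypothetical:

select x.item['name']
from tableName
lateral view explode(from_json(para, 'array<map<string,string>>')) x as item
where x.item['id'] = '1';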
create table tableName as
select 1 as map_id,
       map('1', 'label1',
           '2', 'label2',
           '3', 'label3') as catalog_encode;
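A lookup sketch against this one-row dimension table (fact_t and its int column c are hypothetical; the cast is needed because the map keys are strings):

select catalog_encode['2'] as label  -- 'label2'
from tableName;

select f.c, d.catalog_encode[cast(f.c as string)] as label
from fact_t f
cross join tableName d
where d.map_id = 1;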
Callers should not have to know this mapping, so we need a function that works as a black box and is transparent to its users; define the function as follows:
package your.package.name;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.UDF;

public class CatalogDedoce extends UDF {

    // Builds the code -> label mapping (the "black box" described above).
    public Map<String, Object> initMap() {
        Map<String, Object> result = new HashMap<>();
        result.put("1", "label1");
        result.put("2", "label2");
        result.put("3", "label3");
        return result;
    }

    // arguments: event timestamps in seconds, assumed sorted ascending.
    // Returns 3 when there are more than totalNum events,
    // 2 when some event and the one hasNum positions later are at most
    // spaceMinute minutes apart, and 0 otherwise.
    public int evaluate(List<Long> arguments, int spaceMinute, int hasNum, int totalNum) {
        if (arguments != null && !arguments.isEmpty()) {
            int len = arguments.size();
            if (len > totalNum) {
                return 3;
            }
            if (len > hasNum) {
                int start = 0;
                int end = start + hasNum;
                while (end < len) {
                    if ((arguments.get(end) - arguments.get(start)) / 60 <= spaceMinute) {
                        return 2;
                    }
                    start = start + 1;
                    end = start + hasNum;
                }
            }
        }
        return 0;
    }
}
In Hive, the array<T> type maps to Java's java.util.List, so an array<bigint> column arrives in evaluate as List<Long>.
Register and use the custom function as follows:
ADD JAR *****.jar;
CREATE TEMPORARY FUNCTION function_name AS 'package.ClassName';
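A registration and call sketch; the jar path, the function name check_freq, and the columns user_id / ts_list (an array<bigint> of event timestamps in seconds, assumed sorted ascending) are all hypothetical:

ADD JAR /path/to/your-udf.jar;
CREATE TEMPORARY FUNCTION check_freq AS 'your.package.name.CatalogDedoce';

-- 3 = more than 10 events; 2 = some event and the one 3 positions later
--     are at most 30 minutes apart; 0 = otherwise
select user_id, check_freq(ts_list, 30, 3, 10) as flag
from demo_t;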