加载中…
正文 字体大小:

hivemall: 基于hive udf实现机器学习算法

(2017-03-04 21:32:22)
标签:

机器学习

hive

hadoop

分类: 数据挖掘

hivemall

1. hivemall介绍

hivemall是基于hive udf的机器学习算法工具,在工业实践应用中非常方便,方便数据科学家快速构建机器学习模型原型,部署到实际应用中; 

hivemall: 基于hive udf实现机器学习算法

2. hivemall应用

2.1 输入格式

(1)分类模型输入格式

feature格式 feature ::= &lt;index&gt;:&lt;weight&gt; 或 &lt;index&gt;

index 0表示 bias 变量 10:3.4 123:0.5 34567:0.231 也可以用文本表示变量索引, "height:1.5" "length:2.0" 数值型和离散型变量 数值型变量必须有index,select add_feature_index(array(3,4.0,5)) from dual; 离散型变量可以省略权重,feature ::= &lt;index&gt;

特征哈希,特征非常多,>16777216,特征变量是文本,文本较大,占用大量内存,则考虑使用特征哈希


-- feature is v0.3.2 or before
concat(mhash(extract_feature("xxxxxxx-yyyyyy-weight:55.3")), ":", extract_weight("xxxxxxx-yyyyyy-weight:55.3"))

-- feature is v0.3.2-1 or later
feature(mhash(extract_feature("xxxxxxx-yyyyyy-weight:55.3")), extract_weight("xxxxxxx-yyyyyy-weight:55.3"))

Label format in Binary Classification目标变量格式


&lt;label&gt; ::= 0 | 1

Label format in Multi-class Classification: &lt;label&gt; ::= &lt;int&gt; | &lt;text&gt;

(2)回归模型输入格式 &lt;target&gt; ::= &lt;float&gt;

(3)帮助函数


select feature("weight", 55.0);
weight:55.0

select extract_feature("weight:55.0"), extract_weight("weight:55.0");
weight | 55.0

select feature_index(array("10:0.2","7:0.3","9"));
[10,7,9]

select 
  convert_label(-1), convert_label(1), convert_label(0.0f), convert_label(1.0f)
from 
  dual;
 0.0f | 1.0f | -1 | 1

数值型特征


创建稀疏数值特征
select quantitative_features(array("apple","value"),1,120.3);
["apple:1.0","value:120.3"]

离散型特征


创建离散型稀疏特征
select categorical_features(
  array("is_cat","is_dog","is_lion","is_pengin","species"),
  1, 0, 1.0, true, "dog"
);
["is_cat#1","is_dog#0","is_lion#1.0","is_pengin#true","species#dog"]

准备训练数据表


-- Build a training table in Hivemall's sparse feature format:
-- one unique rowid, one features array, and one label per row.
select 
  rowid() as rowid,
  -- concat_array merges the constant bias term with the generated features
  concat_array(
    array("bias:1.0"),
    -- categorical columns are encoded as "name#value"
    categorical_features( 
      array("id", "name"),
      id, name
    ),
    -- quantitative columns are encoded as "name:value"
    quantitative_features(
      array("height", "weight"),
      height, weight
    )
  ) as features, 
  click_or_not as label
from
  table;
2.2 特征工程

min-max规范化


-- Min-max normalization: find the target's observed range, then
-- rescale it linearly into [0, 1] via Hivemall's rescale() UDF.
select min(target), max(target)
from (
select target from e2006tfidf_train 
-- union all
-- select target from e2006tfidf_test 
) t;
-- console output pasted from the query above (not SQL):
-7.899578 -0.51940954
-- store the observed bounds in Hive variables for the view below
set hivevar:min_target=-7.899578;
set hivevar:max_target=-0.51940954;

create or replace view e2006tfidf_train_scaled 
as
select 
  rowid,
  -- rescale(v, min, max) maps target into [0, 1]
  rescale(target, ${min_target}, ${max_target}) as target, 
  features
from 
  e2006tfidf_train;

z-score


-- Z-score standardization: compute mean and population standard
-- deviation of the target, then center/scale it via zscore().
select avg(target), stddev_pop(target)
from (
select target from e2006tfidf_train 
-- union all
-- select target from e2006tfidf_test 
) t;
-- console output pasted from the query above (not SQL):
-3.566241460963296 0.6278076335455348
-- store the observed statistics in Hive variables for the view below
set hivevar:mean_target=-3.566241460963296;
set hivevar:stddev_target=0.6278076335455348;

create or replace view e2006tfidf_train_scaled 
as
select 
  rowid,
  -- zscore(v, mean, stddev) yields (v - mean) / stddev
  zscore(target, ${mean_target}, ${stddev_target}) as target, 
  features
from 
  e2006tfidf_train;

特征哈希


select feature_hashing('aaa');
> 4063537

select feature_hashing('aaa','-features 3');
> 2

select feature_hashing(array('aaa','bbb'));
> ["4063537","8459207"]

select feature_hashing(array('aaa','bbb'),'-features 10');
> ["7","1"]

select feature_hashing(array('aaa:1.0','aaa','bbb:2.0'));
> ["4063537:1.0","4063537","8459207:2.0"]

select feature_hashing(array(1,2,3));
> ["11293631","3322224","4331412"]

select feature_hashing(array('1','2','3'));
> ["11293631","3322224","4331412"]

select feature_hashing(array('1:0.1','2:0.2','3:0.3'));
> ["11293631:0.1","3322224:0.2","4331412:0.3"]

select feature_hashing(features), features from training_fm limit 2;

> ["1803454","6630176"]   ["userid#5689","movieid#3072"]
> ["1828616","6238429"]   ["userid#4505","movieid#2331"]

select feature_hashing(array("userid#4505:3.3","movieid#2331:4.999", "movieid#2331"));

> ["1828616:3.3","6238429:4.999","6238429"]

tf-idf计算


定义宏函数
-- Macro returning the larger of two ints; used below to guard the
-- IDF denominator against a document frequency of zero.
create temporary macro max2(x INT, y INT)
if(x>y,x,y);

-- create temporary macro idf(df_t INT, n_docs INT)
-- (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0);

-- TF-IDF with smoothed IDF: tf * (log10(n_docs / max(1, df_t)) + 1.0)
create temporary macro tfidf(tf FLOAT, df_t INT, n_docs INT)
tf * (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0);

数据准备
-- External table over '|'-delimited text files: one wiki page per row.
create external table wikipage (
  docid int,
  page string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;

-- shell commands (run outside Hive) to download the sample data
cd ~/tmp
wget https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv

-- load the downloaded TSV into the table above
LOAD DATA LOCAL INPATH '/home/myui/tmp/tfidf_test.tsv' INTO TABLE wikipage;

-- One row per (docid, word): tokenize each page, flatten the token
-- array with LATERAL VIEW explode, and drop stopwords.
create or replace view wikipage_exploded
as
select
  docid, 
  word
from
  wikipage LATERAL VIEW explode(tokenize(page,true)) t as word
where
  not is_stopword(word);
计算tf
-- Per-document term frequency: tf(word) aggregates each document's
-- words into a word->frequency map; the outer LATERAL VIEW explode()
-- flattens that map back into (docid, word, freq) rows.
create or replace view term_frequency 
as
select
  docid, 
  word,
  freq
from (
select
  docid,
  tf(word) as word2freq
from
  wikipage_exploded
group by
  docid
) t 
LATERAL VIEW explode(word2freq) t2 as word, freq;
-- Fix: the original statement was missing its terminating semicolon,
-- so it ran into the following text and would fail to parse.
计算df
-- Document frequency: number of distinct documents containing each word.
create or replace view document_frequency
as
select
  word, 
  count(distinct docid) docs
from
  wikipage_exploded
group by
  word;
-- set the total number of documents
select count(distinct docid) from wikipage;
-- value pasted from the count query above
set hivevar:n_docs=3;
计算tfidf
-- Join per-document tf with per-word df and apply the tfidf macro
-- defined earlier; highest-scoring terms first.
create or replace view tfidf
as
select
  tf.docid,
  tf.word, 
  -- tf.freq * (log(10, CAST(${n_docs} as FLOAT)/max2(1,df.docs)) + 1.0) as tfidf
  tfidf(tf.freq, df.docs, ${n_docs}) as tfidf
from
  term_frequency tf 
  JOIN document_frequency df ON (tf.word = df.word)
order by 
  tfidf desc;

docid  word     tfidf
1       justice 0.1641245850805637
3       knowledge       0.09484606645205085
2       action  0.07033910867777095
1       law     0.06564983513276658
1       found   0.06564983513276658
1       religion        0.06564983513276658
1       discussion      0.06564983513276658

转化为特征变量
-- Collapse per-word tfidf rows into one sparse feature array per
-- document; feature(word, tfidf) encodes each entry as "word:tfidf".
select
  docid, 
  -- collect_list(concat(word, ":", tfidf)) as features -- Hive 0.13 or later
  collect_list(feature(word, tfidf)) as features -- Hivemall v0.3.4 & Hive 0.13 or later
  -- collect_all(concat(word, ":", tfidf)) as features -- before Hive 0.13
from 
  tfidf
group by
  docid;

特征向量化


-- vectorize_features() pairs each column value with its name, emitting
-- "name:value" for numeric columns and "name#value" for categorical
-- ones (see the sample output below).
select
  id,
  vectorize_features(
    array("age","job","marital","education","default","balance","housing","loan","contact","day","month","duration","campaign","pdays","previous","poutcome"), 
    age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
  ) as features,
  y
from
  train
limit 2;

> 1       ["age:39.0","job#blue-collar","marital#married","education#secondary","default#no","balance:1756.0","housing#yes","loan#no","contact#cellular","day:3.0","month#apr","duration:939.0","campaign:1.0","pdays:-1.0","poutcome#unknown"]   1
> 2       ["age:51.0","job#entrepreneur","marital#married","education#primary","default#no","balance:1443.0","housing#no","loan#no","contact#cellular","day:18.0","month#feb","duration:172.0","campaign:10.0","pdays:-1.0","poutcome#unknown"]   1
2.3 模型评估

auc


-- AUC over five hand-made (probability, label) pairs.
-- auc() expects rows sorted by probability in descending order;
-- DISTRIBUTE BY buckets rows so each reducer receives a sorted range.
with scored as (
  select 0.5 as prob, 0 as label union all
  select 0.3 as prob, 1 as label union all
  select 0.2 as prob, 0 as label union all
  select 0.8 as prob, 1 as label union all
  select 0.7 as prob, 1 as label
)
select
  auc(prob, label) as auc
from (
  select prob, label
  from scored
  DISTRIBUTE BY floor(prob / 0.2)
  SORT BY prob DESC
) ordered;

precision recall


-- Ranking metrics: compare each user's recommendation list (ordered by
-- score) against the ground-truth item set, both at the full list
-- length (max_k) and at a cutoff of k=2.
with truth as (
  -- ground-truth relevant items per user
  select userid, collect_set(itemid) as truth
  from dummy_truth
  group by userid
),
rec as (
  select
    userid,
    -- items ordered by score via to_ordered_map; the third argument
    -- presumably requests descending (best-first) order -- TODO confirm
    map_values(to_ordered_map(score, itemid, true)) as rec,
    -- evaluate at k = full recommendation-list length
    cast(count(itemid) as int) as max_k
  from dummy_rec
  group by userid
)
select
  -- rec = [1,3,2,6], truth = [1,2,4] for each user

  -- Recall@k
  recall(t1.rec, t2.truth, t1.max_k) as recall,
  recall(t1.rec, t2.truth, 2) as recall_at_2,

  -- Precision@k
  precision(t1.rec, t2.truth, t1.max_k) as precision,
  precision(t1.rec, t2.truth, 2) as precision_at_2,

  -- MAP
  average_precision(t1.rec, t2.truth, t1.max_k) as average_precision,
  average_precision(t1.rec, t2.truth, 2) as average_precision_at_2,

  -- AUC
  auc(t1.rec, t2.truth, t1.max_k) as auc,
  auc(t1.rec, t2.truth, 2) as auc_at_2,

  -- MRR
  mrr(t1.rec, t2.truth, t1.max_k) as mrr,
  mrr(t1.rec, t2.truth, 2) as mrr_at_2,

  -- NDCG
  ndcg(t1.rec, t2.truth, t1.max_k) as ndcg,
  ndcg(t1.rec, t2.truth, 2) as ndcg_at_2
from rec t1
join truth t2 on (t1.userid = t2.userid)
;

3. 机器学习模型

3.1 二分类

http://hivemall.incubator.apache.org/userguide/binaryclass/titanicrf.html http://hivemall.incubator.apache.org/userguide/regression/kddcup12tr2dataset.html

3.2 回归问题

http://hivemall.incubator.apache.org/userguide/regression/e2006_dataset.html

3.3 协同过滤

http://hivemall.incubator.apache.org/userguide/recommend/itembasedcf.html



0

阅读 评论 收藏 转载 喜欢 打印举报
已投稿到:
  • 评论加载中,请稍候...
发评论

    发评论

    以上网友发言只代表其个人观点,不代表新浪网的观点或立场。

      

    新浪BLOG意见反馈留言板 不良信息反馈 电话:4006900000 提示音后按1键(按当地市话标准计费) 欢迎批评指正

    新浪简介 | About Sina | 广告服务 | 联系我们 | 招聘信息 | 网站律师 | SINA English | 会员注册 | 产品答疑

    新浪公司 版权所有