作者:oschina 来源:开源中国 时间:2018-04-16 21:34:57 我要评论

hive数据压缩 snappy

mkdir 2.5.0-native-snappy
tar -zxvf 2.5.0-native-snappy.tar.gz -C 2.5.0-native-snappy
cd hadoop-2.5.0/lib
#改变原来的native
mv native/ 250native
mkdir native
cp /home/soft/2.5.0-native-snappy/* native/
#回到hadoop根目录(此前已 cd 到 hadoop-2.5.0/lib),再检查native库
cd ..
bin/hadoop checknative
#结果如下
18/04/16 12:30:25 INFO bzip2.Bzip2Factory: Successfully loaded & initialized native-bzip2 library system-native
18/04/16 12:30:25 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
Native library checking:
hadoop: true /opt/modules/hadoop-2.5.0/lib/native/libhadoop.so
zlib:   true /lib64/libz.so.1
snappy: true /opt/modules/hadoop-2.5.0/lib/native/libsnappy.so.1
lz4:    true revision:99
bzip2:  true /lib64/libbz2.so.1

运行一个mapreduce程序

 #在hdfs上创建文件
 bin/hdfs dfs -mkdir -p /user/jianxin/mapreduce/wordcount/input
 #上传文件
 bin/hdfs dfs -put /home/datas/mc.input  /user/jianxin/mapreduce/wordcount/input

 bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount /user/jianxin/mapreduce/wordcount/input  /user/jianxin/mapreduce/wordcount/output
 
bin/hdfs dfs -cat  /user/jianxin/mapreduce/wordcount/output/part-r-00000
#压缩形式
bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount -D mapreduce.map.output.compress=true -D mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec /user/jianxin/mapreduce/wordcount/input /user/jianxin/mapreduce/wordcount/output22
#查看历史记录服务器
http://hadoop.jianxin.com:19888/jobhistory

hive数据表的存储

  • TEXTFILE

create table page_views(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE ;

hive (default)> load data local inpath '/home/datas/page_views.data' into table page_views;

#查看文件的大小
hive (default)> dfs -du -h /user/hive/warehouse/page_views;

18.1 M  /user/hive/warehouse/page_views/page_views.data
  • orc
create table page_views_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc ;


hive (default)> load data local inpath '/home/datas/page_views.data' into table page_views_orc;
#或者(推荐:load data 只是移动文件,不会把文本转换成ORC格式;用 insert ... select 才会真正生成ORC文件,下面 2.6 M 的结果即来自这种方式)
insert into table page_views_orc select * from page_views ;

hive (default)> dfs -du -h /user/hive/warehouse/page_views_orc;

2.6 M  /user/hive/warehouse/page_views_orc/000000_0
  • parquet
create table page_views_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET ;

load data local  inpath  '/home/datas/page_views.data' into table  page_views_parquet;

dfs -du -h /user/hive/warehouse/page_views_parquet
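18.1 M  /user/hive/warehouse/page_views_parquet/page_views.data
#注意:load data 只是把原始文本文件移动到表目录下,并不会转换为parquet格式,所以大小仍是 18.1 M;要得到真正的parquet文件,应使用 insert into table page_views_parquet select * from page_views ;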
  • snappy
create table page_views_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="SNAPPY");

load data local  inpath  '/home/datas/page_views.data' into table  page_views_orc_snappy;

dfs -du -h /user/hive/warehouse/page_views_orc_snappy

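18.1 M  /user/hive/warehouse/page_views_orc_snappy/page_views.data
#注意:这里的 18.1 M 是 load data 原样拷贝的文本文件,并未经过ORC编码和snappy压缩;要看到压缩效果,应使用 insert into table page_views_orc_snappy select * from page_views ;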


create table page_views_orc_none(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc tblproperties ("orc.compress"="NONE");

load data local  inpath  '/home/datas/page_views.data' into table  page_views_orc_none;

dfs -du -h /user/hive/warehouse/page_views_orc_none

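hive (default)> dfs -du -h /user/hive/warehouse/page_views_orc_none;
18.1 M  /user/hive/warehouse/page_views_orc_none/page_views.data
#注意:同样是 load data 原样拷贝的文本文件,并不是未压缩ORC文件的真实大小;应使用 insert into table page_views_orc_none select * from page_views ; 后再查看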


set parquet.compression=SNAPPY ;
create table page_views_parquet_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS parquet;
insert into table page_views_parquet_snappy select * from page_views ;
dfs -du -h /user/hive/warehouse/page_views_parquet_snappy/ ;

hive (default)> dfs -du -h /user/hive/warehouse/page_views_parquet_snappy/ ;
6.4 M  /user/hive/warehouse/page_views_parquet_snappy/000000_0

总结

在实际的项目开发当中,hive表的数据存储格式选用 orcfile 或者 parquet,数据压缩采用 snappy。

hive企业优化



文章转载自 开源中国社区 [http://www.oschina.net]

本文地址:https://my.oschina.net/u/3798913/blog/1796433

8阅读 | 0评论
你的回应
写文章

联系我们