for (Result r : rs) {
System.out.println(\获得到rowkey:\ for (KeyValue keyValue : r.raw()) {
System.out.println(\列:\ + \值:\ } }
} catch (IOException e) { e.printStackTrace(); } }
/**
* 单条件查询,根据rowkey查询唯一一条记录 * @param tableName */
public static void QueryByCondition1(String tableName) {
HTablePool pool = new HTablePool(configuration, 1000); HTable table = (HTable) pool.getTable(tableName); try {
Get scan = new Get(\根据rowkey查询 Result r = table.get(scan);
System.out.println(\获得到rowkey:\ for (KeyValue keyValue : r.raw()) {
System.out.println(\列:\ + \值:\ }
} catch (IOException e) { e.printStackTrace(); } }
/**
* 单条件按查询,查询多条记录 * @param tableName */
public static void QueryByCondition2(String tableName) {
try {
HTablePool pool = new HTablePool(configuration, 1000); HTable table = (HTable) pool.getTable(tableName); Filter filter = new SingleColumnValueFilter(Bytes
.toBytes(\
.toBytes(\当列column1的值为aaa时进行查询 Scan s = new Scan(); s.setFilter(filter);
ResultScanner rs = table.getScanner(s); for (Result r : rs) {
System.out.println(\获得到rowkey:\ for (KeyValue keyValue : r.raw()) {
System.out.println(\列:\ + \值:\ } }
} catch (Exception e) { e.printStackTrace(); }
}
/**
 * Combined-condition scan: AND-combine three single-column filters
 * (column1=aaa, column2=bbb, column3=ccc) and print every matching row.
 *
 * NOTE(review): filter arguments reconstructed from the tutorial original
 * after escape-stripping corruption — verify family/value bytes against the
 * actual table schema.
 *
 * @param tableName name of the HBase table to scan
 */
public static void QueryByCondition3(String tableName) {
    try {
        HTablePool pool = new HTablePool(configuration, 1000);
        HTable table = (HTable) pool.getTable(tableName);

        List<Filter> filters = new ArrayList<Filter>();

        Filter filter1 = new SingleColumnValueFilter(
                Bytes.toBytes("column1"), null,
                CompareOp.EQUAL, Bytes.toBytes("aaa"));
        filters.add(filter1);

        Filter filter2 = new SingleColumnValueFilter(
                Bytes.toBytes("column2"), null,
                CompareOp.EQUAL, Bytes.toBytes("bbb"));
        filters.add(filter2);

        Filter filter3 = new SingleColumnValueFilter(
                Bytes.toBytes("column3"), null,
                CompareOp.EQUAL, Bytes.toBytes("ccc"));
        filters.add(filter3);

        // Default FilterList operator is MUST_PASS_ALL (logical AND).
        FilterList filterList1 = new FilterList(filters);

        Scan scan = new Scan();
        scan.setFilter(filterList1);
        ResultScanner rs = table.getScanner(scan);
        for (Result r : rs) {
            System.out.println("获得到rowkey:" + new String(r.getRow()));
            for (KeyValue keyValue : r.raw()) {
                System.out.println("列:" + new String(keyValue.getFamily())
                        + "====值:" + new String(keyValue.getValue()));
            }
        }
        // Release the scanner's server-side resources.
        rs.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
} }
实验名称:大数据综合案例
一 目的
1.掌握Hadoop大数据基本框架。 2.掌握MR核心编程。
3.掌握Hadoop生态组件使用。
二 内容
使用Hadoop框架完成日志的数据清理,分析。
三 步骤
1、先在本地创建一个gd.txt文件,然后把数据导入进去;
2、在hdfs上创建一个demo文件夹,然后在demo文件夹中创建t1文件夹(t1文件夹也可以不创建,可以只有demo一个文件夹),然后把ubuntu中的gd.txt文件导入到hdfs中的demo文件夹中的t1文件夹中,然后使用命令查看是否导入进去
3、进入hive中,创建表t1(ip、年、月、日、网址),然后使用命令查看表t1中的数据; (1)分别统计30,31号总流量
select count(*) from t1 where day=30; select count(*) from t1 where day=31;
(2)分别统计30,31号所有IP数(去重) select distinct ip from t1 where day=30;
select distinct ip from t1 where day=31;
(3)统计30,31号IP访问数为1的。
create table t2(ip String,ipcount int) row format delimited fields terminated by '\\t';
insert overwrite table t2 select ip,count(ip) ipcount from t1 where day=30 group by ip;
select * from t2;
select ip,ipcount from t2 where ipcount=1;
create table t3(ip String,ipcount int) row format delimited fields terminated by '\\t'; OK
Time taken: 0.247 seconds
insert overwrite table t3 select ip,count(ip) ipcount from t1 where day=31 group by ip; hive> select * from t3; OK
211.97.15.179 4 27.19.74.143 7 8.35.201.161 1 8.35.201.163 1 8.35.201.164 2 8.35.201.165 4
Time taken: 0.089 seconds, Fetched: 6 row(s) hive> select ip,ipcount from t3 where ipcount=1; OK
8.35.201.161 1 8.35.201.163 1
Time taken: 0.101 seconds, Fetched: 2 row(s)
(4)统计30,31号IP访问最高的。
select ip,ipcount from t2 order by ipcount desc limit 1; select ip,ipcount from t3 order by ipcount desc limit 1;
四 结果
1.成功搭建Hadoop生态环境,用于海量数据分析。 2.对某日志的hive数据清洗分析。
五 疑难
1.在最开始的不太懂MR的原理。 2.搭建hive的时候出现各种的错误。
六 算法
(1)分别统计30,31号总流量
select count(*) from t1 where day=30; select count(*) from t1 where day=31;
(2)分别统计30,31号所有IP数(去重) select distinct ip from t1 where day=30; select distinct ip from t1 where day=31; (3)统计30,31号IP访问数为1的。
create table t2(ip String,ipcount int) row format delimited fields terminated by '\\t';
insert overwrite table t2 select ip,count(ip) ipcount from t1 where day=30 group by ip;
select * from t2;
select ip,ipcount from t2 where ipcount=1;
create table t3(ip String,ipcount int) row format delimited fields terminated by '\\t'; OK
Time taken: 0.247 seconds
insert overwrite table t3 select ip,count(ip) ipcount from t1 where day=31 group by ip; hive> select * from t3; OK
211.97.15.179 4 27.19.74.143 7 8.35.201.161 1 8.35.201.163 1 8.35.201.164 2 8.35.201.165 4
Time taken: 0.089 seconds, Fetched: 6 row(s) hive> select ip,ipcount from t3 where ipcount=1; OK
8.35.201.161 1 8.35.201.163 1
Time taken: 0.101 seconds, Fetched: 2 row(s) (4)统计30,31号IP访问最高的。
select ip,ipcount from t2 order by ipcount desc limit 1; select ip,ipcount from t3 order by ipcount desc limit 1;
非关系数据库
实验名称:HBase的安装与配置
一 目的
1. 掌握HBase完全分布式的安装方法; 2. 验证HBase完全分布式的安装;
3. 打开Web UI管理界面验证HBase的安装; 4. 打开HBase Shell验证测试安装环境。
二 内容