HBase Observer中ES增加创建IK mapping

ES中的中文分词改为使用IK分词器。
通过Java API写入数据时,需要在创建mapping时为相应字段指定IK分词器。
同时,ES的批量写入方式也从原来的BulkRequestBuilder,改为参数更多、更灵活的BulkProcessor。

1.原来的ElasticSearchOperator

package com.xxx.data;


import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.delete.DeleteRequestBuilder;
import org.elasticsearch.action.update.UpdateRequestBuilder;
import org.elasticsearch.client.Client;

import java.util.HashMap;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

//import org.elasticsearch.client.transport.TransportClient;
//import org.elasticsearch.common.settings.ImmutableSettings;
//import org.elasticsearch.common.settings.Settings;
//import org.elasticsearch.common.transport.InetSocketTransportAddress;

/**
 * Buffers Elasticsearch update/delete requests in a shared {@link BulkRequestBuilder}
 * and sends them in bulk, either when the buffer exceeds {@link #MAX_BULK_COUNT}
 * actions or when the periodic {@link CommitTimer} fires.
 *
 * All access to the shared buffer is serialized through {@code commitLock}.
 */
public class ElasticSearchOperator {

    // Flush threshold: a bulk is sent once the buffer holds MORE than this many actions
    private static final int MAX_BULK_COUNT = 10;
    // Maximum interval between timed flushes (seconds)
    private static final int MAX_COMMIT_INTERVAL = 60 * 5;

    private static Client client = null;
    private static BulkRequestBuilder bulkRequestBuilder = null;

    private static Lock commitLock = new ReentrantLock();

    static {

        // elasticsearch1.5.0
//        Settings settings = ImmutableSettings.settingsBuilder()
//                .put("cluster.name", Config.clusterName).build();
//        client = new TransportClient(settings)
//                .addTransportAddress(new InetSocketTransportAddress(
//                        Config.nodeHost, Config.nodePort));

        // 2.3.5
        client = MyTransportClient.client;

        bulkRequestBuilder = newBulkRequestBuilder();

        // Daemon timer: periodic flushing is a maintenance task and must not
        // keep the hosting JVM alive on its own.
        Timer timer = new Timer("es-bulk-commit", true);
        timer.schedule(new CommitTimer(), 10 * 1000, MAX_COMMIT_INTERVAL * 1000);
    }

    /**
     * Creates an empty bulk buffer. Every buffer is configured the same way
     * (refresh after execution), including the ones created after a flush.
     *
     * @return a fresh, empty bulk request builder
     */
    private static BulkRequestBuilder newBulkRequestBuilder() {
        BulkRequestBuilder builder = client.prepareBulk();
        builder.setRefresh(true);
        return builder;
    }

    /**
     * Sends the buffered actions if the buffer holds more than {@code threshold} actions.
     * Callers must hold {@code commitLock}.
     *
     * The buffer is replaced after every completed bulk call, even when some items
     * failed: keeping it would re-send the already successful items on every later
     * call and let one permanently failing item grow the buffer without bound.
     * Item failures are reported on stderr. If the call itself throws (for example
     * because no node is reachable) the buffer is kept, so it is retried later.
     *
     * @param threshold number of buffered actions that must be exceeded before sending
     */
    private static void bulkRequest(int threshold) {
        if (bulkRequestBuilder.numberOfActions() <= threshold) {
            return;
        }
        BulkResponse bulkResponse = bulkRequestBuilder.execute().actionGet();
        if (bulkResponse.hasFailures()) {
            System.err.println("ElasticSearchOperator: bulk request had failures: "
                    + bulkResponse.buildFailureMessage());
        }
        bulkRequestBuilder = newBulkRequestBuilder();
    }

    /**
     * Adds an update (index) request to the buffer and flushes if the buffer is full.
     * Exceptions are printed and not propagated.
     *
     * @param builder the update request to buffer
     */
    public static void addUpdateBuilderToBulk(UpdateRequestBuilder builder) {
        commitLock.lock();
        try {
            bulkRequestBuilder.add(builder);
            bulkRequest(MAX_BULK_COUNT);
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            commitLock.unlock();
        }
    }

    /**
     * Adds a delete request to the buffer and flushes if the buffer is full.
     * Exceptions are printed and not propagated.
     *
     * @param builder the delete request to buffer
     */
    public static void addDeleteBuilderToBulk(DeleteRequestBuilder builder) {
        commitLock.lock();
        try {
            bulkRequestBuilder.add(builder);
            bulkRequest(MAX_BULK_COUNT);
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            commitLock.unlock();
        }
    }

    /**
     * Timed task that flushes any non-empty buffer, so that Elasticsearch does not
     * stay out of sync with HBase when a RegionServer receives no updates for a while.
     */
    static class CommitTimer extends TimerTask {
        @Override
        public void run() {
            commitLock.lock();
            try {
                // threshold 0: send whenever at least one action is buffered
                bulkRequest(0);
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                commitLock.unlock();
            }
        }
    }

    /**
     * Manual smoke test: buffers ten delete requests and prints the buffer size.
     */
    private static void test() {
        Config.indexName = "flume-2016-08-10";
        Config.typeName = "tweet";
        for (int i = 10; i < 20; i++) {
            Map<String, Object> json = new HashMap<String, Object>();
            json.put("field", "ttt");
            // add
//            addUpdateBuilderToBulk(client.prepareUpdate(Config.indexName, Config.typeName, String.valueOf(i)).setDoc(json).setUpsert(json));
            // delete
            addDeleteBuilderToBulk(client.prepareDelete(Config.indexName, Config.typeName, String.valueOf(i)));
        }

        System.out.println(bulkRequestBuilder.numberOfActions());
    }

    public static void main(String[] args) {
        test();
    }
}

2.改成ElasticSearchBulkProcessor

package com.xxx.data;

import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.Requests;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;

import java.util.*;

/**
 * Created by lisiyu on 16/9/19.
 *
 * Buffers index requests in a shared {@link BulkProcessor} and, before the first
 * document for an index/type/field combination is queued, makes sure a mapping
 * exists that analyzes those fields with the "ik" analyzer.
 */
public class ElasticSearchBulkProcessor {

    private static Client client = null;
    private static BulkProcessor bulkProcessor = null;

    // Buffer capacity (number of requests)
    private static final int MAX_BULK_COUNT = 1000;
    // Buffer capacity (size in MB)
    private static final int MAX_BULK_SIZE = 1024;
    // Maximum interval between flushes (seconds)
    private static final int MAX_COMMIT_INTERVAL = 60 * 1;
    // Maximum number of concurrent bulk requests
    private static final int MAX_CONCURRENT_REQUEST = 2;
    // Initial wait before retrying a rejected bulk (ms)
    private static final int REJECT_EXCEPTION_RETRY_WAIT = 500;
    // Number of retries for a rejected bulk
    private static final int REJECT_EXCEPTION_RETRY_TIMES = 3;

    // "index/type" -> fields this JVM has already put a mapping for.
    // Guarded by the class monitor (see ensureMapping).
    private static final Map<String, Set<String>> MAPPED_FIELDS = new HashMap<String, Set<String>>();

    static {
        // 2.3.5
        client = MyTransportClient.client;

        bulkProcessor = BulkProcessor.builder(client, new LoggingListener())
                .setBulkActions(MAX_BULK_COUNT)
                .setBulkSize(new ByteSizeValue(MAX_BULK_SIZE, ByteSizeUnit.MB))
                .setFlushInterval(TimeValue.timeValueSeconds(MAX_COMMIT_INTERVAL))
                .setConcurrentRequests(MAX_CONCURRENT_REQUEST)
                .setBackoffPolicy(
                        BackoffPolicy.exponentialBackoff(
                                TimeValue.timeValueMillis(REJECT_EXCEPTION_RETRY_WAIT),
                                REJECT_EXCEPTION_RETRY_TIMES))
                .build();
    }

    /**
     * Bulk listener that reports failures on stderr instead of dropping them silently.
     */
    private static final class LoggingListener implements BulkProcessor.Listener {
        @Override
        public void beforeBulk(long executionId, BulkRequest request) {
            // nothing to do before a bulk is sent
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
            if (response.hasFailures()) {
                System.err.println("ElasticSearchBulkProcessor: bulk " + executionId
                        + " had failures: " + response.buildFailureMessage());
            }
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
            System.err.println("ElasticSearchBulkProcessor: bulk " + executionId
                    + " (" + request.numberOfActions() + " actions) failed");
            failure.printStackTrace();
        }
    }

    /**
     * Queues an index request after making sure the target index/type has an
     * ik-analyzed mapping for the given fields. Exceptions are printed and not
     * propagated; if the mapping step fails the request is not queued.
     *
     * @param indexRequest the document to index
     * @param fieldSet     names of the document fields that need an ik mapping
     */
    public static void addIndexRequestToBulkProcessor(IndexRequest indexRequest,Set<String> fieldSet) {
        try {
            // Report the target index and type
            System.out.println("index:"+indexRequest.index());
            System.out.println("type:"+indexRequest.type());

            // Create the index / mapping with the ik analyzer, only when something is new
            ensureMapping(indexRequest.index(),indexRequest.type(),fieldSet);

            // Queue the document
            bulkProcessor.add(indexRequest);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Calls {@link #createMapping} only when this JVM has not yet mapped the given
     * index/type or one of the fields. Without this, every single document costs
     * two synchronous admin round trips (exists + put mapping).
     *
     * Synchronized so that concurrent callers in this JVM do not race to create
     * the same index. NOTE(review): the cache is per JVM and is not invalidated if
     * the index is deleted externally while the process keeps running.
     */
    private static synchronized void ensureMapping(String index, String mappingType, Set<String> fieldSet)
            throws Exception {
        Set<String> requested = fieldSet == null ? Collections.<String>emptySet() : fieldSet;
        String key = index + "/" + mappingType;
        Set<String> known = MAPPED_FIELDS.get(key);
        if (known != null && known.containsAll(requested)) {
            return;
        }
        createMapping(index, mappingType, requested);
        // Record only after the admin call succeeded, so a failure is retried next time
        if (known == null) {
            known = new HashSet<String>();
            MAPPED_FIELDS.put(key, known);
        }
        known.addAll(requested);
    }

    /**
     * Builds the mapping source: a stored string field "id" plus one stored,
     * ik-analyzed string field per entry of {@code fieldSet}.
     * The index and type names must not be part of this source.
     */
    private static XContentBuilder buildMapping(Set<String> fieldSet) throws Exception {
        XContentBuilder builder = XContentFactory.jsonBuilder()
                .startObject()
                .startObject("properties")
                .startObject("id").field("type", "string").field("store", "yes").endObject();
        if (fieldSet != null) {
            for (String field : fieldSet) {
                builder = builder.startObject(field).field("type", "string").field("store", "yes").field("analyzer", "ik").endObject();
            }
        }
        return builder.endObject().endObject();
    }

    /**
     * Puts an ik-analyzed mapping for the given fields. If the index already exists
     * the mapping is added to it; otherwise the index is created with the mapping.
     * See the IK analysis plugin documentation for analyzer details.
     *
     * @param index       index name
     * @param mappingType mapping type name
     * @param fieldSet    field (column) names to map; {@code null} is treated as empty
     * @throws Exception if building the mapping or an admin call fails
     */
    public static void createMapping(String index,String mappingType,Set<String> fieldSet)throws Exception{
        if(client.admin().indices().exists(new IndicesExistsRequest(index)).actionGet().isExists()){
            System.out.println("index: '"+index+"' is exist!");
            PutMappingRequest mapping = Requests.putMappingRequest(index).type(mappingType).source(buildMapping(fieldSet));
            client.admin().indices().putMapping(mapping).actionGet();
        } else {
            System.out.println("create index: '"+index+"'!");
            client.admin().indices().prepareCreate(index).addMapping(mappingType, buildMapping(fieldSet)).get();
        }
    }

    /**
     * Manual smoke test: indexes one document through a locally built processor.
     */
    public static void test() {
        // on startup
        Client client = MyTransportClient.client;
        BulkProcessor bulkProcessor = BulkProcessor.builder(client, new LoggingListener())
                .setBulkActions(10000)
                .setBulkSize(new ByteSizeValue(1, ByteSizeUnit.GB))
                .setFlushInterval(TimeValue.timeValueSeconds(5))
                .setConcurrentRequests(1)
                .setBackoffPolicy(
                        BackoffPolicy.exponentialBackoff(TimeValue.timeValueMillis(100), 3))
                .build();

        Map<String, Object> json = new HashMap<String, Object>();
        json.put("field", "test");
        bulkProcessor.add(new IndexRequest("twitter", "tweet", "1111").source(json));

        // Flush the pending request and wait for it; otherwise the document may
        // never be sent before the method returns and the processor is leaked.
        try {
            bulkProcessor.awaitClose(10, java.util.concurrent.TimeUnit.SECONDS);
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
        }
    }

    public static void main(String[] args) {
        test();
    }
}

3.DataSyncObserver类修改

@Override
    public void postPut(ObserverContext<RegionCoprocessorEnvironment> e, Put put, WALEdit edit, Durability durability) throws IOException {
        /**
         * Previous approach: called ElasticSearchOperator, which did not create
         * an IK-analyzed mapping for Chinese text.
         */
//        try {
//            String indexId = new String(put.getRow());
//            Map<byte[], List<Cell>> familyMap = put.getFamilyCellMap();
////            NavigableMap<byte[], List<Cell>> familyMap = put.getFamilyCellMap();
//            Map<String, Object> json = new HashMap<String, Object>();
//            for (Map.Entry<byte[], List<Cell>> entry : familyMap.entrySet()) {
//                for (Cell cell : entry.getValue()) {
//                    String key = Bytes.toString(CellUtil.cloneQualifier(cell));
//                    String value = Bytes.toString(CellUtil.cloneValue(cell));
//                    json.put(key, value);
//                }
//            }
//            System.out.println();
//            ElasticSearchOperator.addUpdateBuilderToBulk(client.prepareUpdate(Config.indexName, Config.typeName, indexId).setDoc(json).setUpsert(json));
//            LOG.info("observer -- add new doc: " + indexId + " to type: " + Config.typeName);
//        } catch (Exception ex) {
//            LOG.error(ex);
//        }

        /**
         * Current approach: calls ElasticSearchBulkProcessor, which creates an
         * IK-analyzed mapping for the written columns before indexing.
         *
         * Mirrors the Put into Elasticsearch: the row key becomes the document id
         * and every cell becomes a field named after its column qualifier.
         * Failures are logged and never propagated to the HBase write path.
         */
        try {
            // Decode the row key the same way as qualifiers and values (UTF-8),
            // not with the platform default charset.
            String indexId = Bytes.toString(put.getRow());
            NavigableMap<byte[], List<Cell>> familyMap = put.getFamilyCellMap();
            HashSet<String> fieldSet = new HashSet<String>();
            HashMap<String, Object> json = new HashMap<String, Object>();

            // The column family is not part of the field name, so the same
            // qualifier in two families ends up in a single field.
            for (List<Cell> cells : familyMap.values()) {
                for (Cell cell : cells) {
                    String key = Bytes.toString(CellUtil.cloneQualifier(cell));
                    String value = Bytes.toString(CellUtil.cloneValue(cell));
                    json.put(key, value);
                    fieldSet.add(key);
                }
            }

            ElasticSearchBulkProcessor.addIndexRequestToBulkProcessor((new IndexRequest(Config.indexName, Config.typeName, indexId)).source(json), fieldSet);
            LOG.info("observer -- add new doc: " + indexId + " to type: " + Config.typeName);
        } catch (Exception ex) {
            // Pass the throwable separately so the stack trace is logged
            LOG.error("observer -- failed to sync put to type: " + Config.typeName, ex);
        }
    }

4.测试

  • 代码打包
  • jar包上传到hdfs
  • 创建hbase表,并修改表属性关联observer
  • 测试put新数据
  • 查看es中数据
  • 中文分词测试

{"query":{"query_string":{"query":"拖鞋"}},"highlight":{"require_field_match":false,"explain":true,"fields":{"*":{}}}}

中文分词测试.jpg

5.程序代码整体和其余测试等操作可以查看另一篇文章

Sqoop导入HBase,并借助Coprocessor协处理器同步索引到ES

推荐阅读更多精彩内容