hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/HRegionPartitioner.java

   1 /**
   2  *
   3  * Licensed to the Apache Software Foundation (ASF) under one
   4  * or more contributor license agreements.  See the NOTICE file
   5  * distributed with this work for additional information
   6  * regarding copyright ownership.  The ASF licenses this file
   7  * to you under the Apache License, Version 2.0 (the
   8  * "License"); you may not use this file except in compliance
   9  * with the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19 package org.apache.hadoop.hbase.mapreduce;
  20
  21 import java.io.IOException;
  22 import org.apache.yetus.audience.InterfaceAudience;
  23 import org.slf4j.Logger;
  24 import org.slf4j.LoggerFactory;
  25 import org.apache.hadoop.conf.Configurable;
  26 import org.apache.hadoop.conf.Configuration;
  27 import org.apache.hadoop.hbase.HBaseConfiguration;
  28 import org.apache.hadoop.hbase.TableName;
  29 import org.apache.hadoop.hbase.client.Connection;
  30 import org.apache.hadoop.hbase.client.ConnectionFactory;
  31 import org.apache.hadoop.hbase.client.RegionLocator;
  32 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
  33 import org.apache.hadoop.hbase.mapred.TableOutputFormat;
  34 import org.apache.hadoop.hbase.util.Bytes;
  35 import org.apache.hadoop.mapreduce.Partitioner;
  36
  37 /**
  38  * This is used to partition the output keys into groups of keys.
  39  * Keys are grouped according to the regions that currently exist
  40  * so that each reducer fills a single region so load is distributed.
  41  *
  42  * <p>This class is not suitable as partitioner creating hfiles
  43  * for incremental bulk loads as region spread will likely change between time of
  44  * hfile creation and load time. See {@link org.apache.hadoop.hbase.tool.BulkLoadHFiles}
  45  * and <a href="http://hbase.apache.org/book.html#arch.bulk.load">Bulk Load</a>.</p>
  46  *
  47  * @param <KEY>  The type of the key.
  48  * @param <VALUE>  The type of the value.
  49  */
  50 @InterfaceAudience.Public
  51 public class HRegionPartitioner<KEY, VALUE>
  52 extends Partitioner<ImmutableBytesWritable, VALUE>
  53 implements Configurable {
  54
  55   private static final Logger LOG = LoggerFactory.getLogger(HRegionPartitioner.class);
  56   private Configuration conf = null;
  57   // Connection and locator are not cleaned up; they just die when partitioner is done.
  58   private Connection connection;
  59   private RegionLocator locator;
  60   private byte[][] startKeys;
  61
  62   /**
  63    * Gets the partition number for a given key (hence record) given the total
  64    * number of partitions i.e. number of reduce-tasks for the job.
  65    *
  66    * <p>Typically a hash function on a all or a subset of the key.</p>
  67    *
  68    * @param key  The key to be partitioned.
  69    * @param value  The entry value.
  70    * @param numPartitions  The total number of partitions.
  71    * @return The partition number for the <code>key</code>.
  72    * @see org.apache.hadoop.mapreduce.Partitioner#getPartition(
  73    *   java.lang.Object, java.lang.Object, int)
  74    */
  75   @Override
  76   public int getPartition(ImmutableBytesWritable key,
  77       VALUE value, int numPartitions) {
  78     byte[] region = null;
  79     // Only one region return 0
  80     if (this.startKeys.length == 1){
  81       return 0;
  82     }
  83     try {
  84       // Not sure if this is cached after a split so we could have problems
  85       // here if a region splits while mapping
  86       region = this.locator.getRegionLocation(key.get()).getRegion().getStartKey();
  87     } catch (IOException e) {
  88       LOG.error(e.toString(), e);
  89     }
  90     for (int i = 0; i < this.startKeys.length; i++){
  91       if (Bytes.compareTo(region, this.startKeys[i]) == 0 ){
  92         if (i >= numPartitions-1){
  93           // cover if we have less reduces then regions.
  94           return (Integer.toString(i).hashCode()
  95               & Integer.MAX_VALUE) % numPartitions;
  96         }
  97         return i;
  98       }
  99     }
 100     // if above fails to find start key that match we need to return something
 101     return 0;
 102   }
 103
 104   /**
 105    * Returns the current configuration.
 106    *
 107    * @return The current configuration.
 108    * @see org.apache.hadoop.conf.Configurable#getConf()
 109    */
 110   @Override
 111   public Configuration getConf() {
 112     return conf;
 113   }
 114
 115   /**
 116    * Sets the configuration. This is used to determine the start keys for the
 117    * given table.
 118    *
 119    * @param configuration  The configuration to set.
 120    * @see org.apache.hadoop.conf.Configurable#setConf(
 121    *   org.apache.hadoop.conf.Configuration)
 122    */
 123   @Override
 124   public void setConf(Configuration configuration) {
 125     this.conf = HBaseConfiguration.create(configuration);
 126     try {
 127       this.connection = ConnectionFactory.createConnection(HBaseConfiguration.create(conf));
 128       TableName tableName = TableName.valueOf(conf.get(TableOutputFormat.OUTPUT_TABLE));
 129       this.locator = this.connection.getRegionLocator(tableName);
 130     } catch (IOException e) {
 131       LOG.error(e.toString(), e);
 132     }
 133     try {
 134       this.startKeys = this.locator.getStartKeys();
 135     } catch (IOException e) {
 136       LOG.error(e.toString(), e);
 137     }
 138   }
 139 }