2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org
.apache
.hadoop
.hbase
.regionserver
;
20 import static org
.junit
.Assert
.assertTrue
;
22 import java
.io
.IOException
;
23 import java
.util
.NavigableMap
;
24 import java
.util
.TreeMap
;
25 import java
.util
.concurrent
.CountDownLatch
;
26 import java
.util
.concurrent
.TimeUnit
;
28 import org
.apache
.hadoop
.conf
.Configuration
;
29 import org
.apache
.hadoop
.fs
.FileSystem
;
30 import org
.apache
.hadoop
.fs
.Path
;
31 import org
.apache
.hadoop
.hbase
.CellScanner
;
32 import org
.apache
.hadoop
.hbase
.HBaseClassTestRule
;
33 import org
.apache
.hadoop
.hbase
.HBaseTestingUtil
;
34 import org
.apache
.hadoop
.hbase
.HConstants
;
35 import org
.apache
.hadoop
.hbase
.TableName
;
36 import org
.apache
.hadoop
.hbase
.client
.Durability
;
37 import org
.apache
.hadoop
.hbase
.client
.Put
;
38 import org
.apache
.hadoop
.hbase
.regionserver
.wal
.FSHLog
;
39 import org
.apache
.hadoop
.hbase
.testclassification
.MediumTests
;
40 import org
.apache
.hadoop
.hbase
.testclassification
.RegionServerTests
;
41 import org
.apache
.hadoop
.hbase
.util
.Bytes
;
42 import org
.apache
.hadoop
.hbase
.util
.EnvironmentEdgeManager
;
43 import org
.apache
.hadoop
.hbase
.util
.EnvironmentEdgeManagerTestHelper
;
44 import org
.apache
.hadoop
.hbase
.util
.Threads
;
45 import org
.apache
.hadoop
.hbase
.wal
.WAL
;
46 import org
.apache
.hadoop
.hbase
.wal
.WALEdit
;
47 import org
.apache
.hadoop
.hbase
.wal
.WALKeyImpl
;
48 import org
.apache
.hadoop
.hbase
.wal
.WALProvider
.Writer
;
49 import org
.junit
.After
;
50 import org
.junit
.Assert
;
51 import org
.junit
.Before
;
52 import org
.junit
.ClassRule
;
53 import org
.junit
.Rule
;
54 import org
.junit
.Test
;
55 import org
.junit
.experimental
.categories
.Category
;
56 import org
.junit
.rules
.TestName
;
57 import org
.mockito
.Mockito
;
58 import org
.slf4j
.Logger
;
59 import org
.slf4j
.LoggerFactory
;
61 import org
.apache
.hbase
.thirdparty
.com
.google
.common
.io
.Closeables
;
64 * Testing for lock up of FSHLog.
66 @Category({ RegionServerTests
.class, MediumTests
.class })
67 public class TestWALLockup
{
// Static HBaseClassTestRule instance built for this test class via HBaseClassTestRule.forClass.
70 public static final HBaseClassTestRule CLASS_RULE
=
71 HBaseClassTestRule
.forClass(TestWALLockup
.class);
// SLF4J logger shared by the tests and the dodgy WAL classes in this file.
73 private static final Logger LOG
= LoggerFactory
.getLogger(TestWALLockup
.class);
// JUnit TestName instance; setup() and getName() read the running method's name from it.
76 public TestName name
= new TestName();
// Column family used by these tests, kept both as a String and as its byte[] form.
78 private static final String COLUMN_FAMILY
= "MyCF";
79 private static final byte [] COLUMN_FAMILY_BYTES
= Bytes
.toBytes(COLUMN_FAMILY
);
// Shared testing utility; setup() copies its Configuration into CONF before each use.
81 private static HBaseTestingUtil TEST_UTIL
= new HBaseTestingUtil();
82 private static Configuration CONF
;
// Table name for the current test; assigned in setup() from the test method name.
86 protected TableName tableName
;
// Test initialisation: caches the shared Configuration from TEST_UTIL in CONF, sets
// HFILE_BLOCK_CACHE_SIZE_KEY to 0, records the data test directory path in `dir` and derives
// `tableName` from the running test method's name.
// NOTE(review): `dir` is assigned here but its declaration is not visible in this listing.
89 public void setup() throws IOException
{
90 CONF
= TEST_UTIL
.getConfiguration();
91 // Disable block cache.
92 CONF
.setFloat(HConstants
.HFILE_BLOCK_CACHE_SIZE_KEY
, 0f
);
// NOTE(review): the sub-directory name "TestHRegion" is hard-coded and does not match this
// class's name - confirm that is intended.
93 dir
= TEST_UTIL
.getDataTestDir("TestHRegion").toString();
94 tableName
= TableName
.valueOf(name
.getMethodName());
// Test cleanup: resets EnvironmentEdgeManagerTestHelper, logs the data test directory and then
// calls TEST_UTIL.cleanupTestDir().
98 public void tearDown() throws Exception
{
99 EnvironmentEdgeManagerTestHelper
.reset();
100 LOG
.info("Cleaning test directory: " + TEST_UTIL
.getDataTestDir());
101 TEST_UTIL
.cleanupTestDir();
// Returns the name of the currently running test method, as reported by the `name` field.
104 private String
getName() {
105 return name
.getMethodName();
108 // A WAL that we can have throw exceptions when a flag is set.
// FSHLog subclass used by testLockupWhenSyncInMiddleOfZigZagSetup. While throwException is true
// the Writer it hands out throws an IOException from sync and append, and the two protected
// methods afterCreatingZigZagLatch / beforeWaitOnSafePoint coordinate through `latch`.
// NOTE(review): this listing appears to have lost some lines (for example the `try {` matching
// the `catch` below, and several closing braces); the comments describe only what is visible.
109 private static final class DodgyFSLog
extends FSHLog
{
110 // Set this when we want the WAL to start throwing exceptions.
111 volatile boolean throwException
= false;
113 // Latch to hold up processing until after another operation has had time to run.
// Created with a count of 1; awaited in afterCreatingZigZagLatch and counted down in
// beforeWaitOnSafePoint.
114 CountDownLatch latch
= new CountDownLatch(1);
// Passes all four arguments straight through to the FSHLog constructor.
116 public DodgyFSLog(FileSystem fs
, Path root
, String logDir
, Configuration conf
)
118 super(fs
, root
, logDir
, conf
);
// When throwException is set, waits up to 5 seconds on `latch` and logs a warning if the wait
// times out; does nothing visible otherwise.
122 protected void afterCreatingZigZagLatch() {
123 // If throwException set, then append will throw an exception causing the WAL to be
124 // rolled. We'll come in here. Hold up processing until a sync can get in before
125 // the zigzag has time to complete its setup and get its own sync in. This is what causes
126 // the lock up we've seen in production.
127 if (throwException
) {
130 // So, timing can have it that the test can run and the bad flush below happens
131 // before we get here. In this case, we'll be stuck waiting on this latch but there
132 // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
133 // because all WALs have rolled. In this case, just give up on test.
134 if (!this.latch
.await(5, TimeUnit
.SECONDS
)) {
135 LOG
.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
// The handling of the interrupt is not visible in this listing.
137 } catch (InterruptedException e
) {
// When throwException is set, logs "COUNTDOWN" and counts the latch down so that the waiter in
// afterCreatingZigZagLatch can proceed.
143 protected void beforeWaitOnSafePoint() {
144 if (throwException
) {
145 LOG
.info("COUNTDOWN");
146 // Don't count down the latch until someone is waiting on it; otherwise the above
147 // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
148 // be stuck; test won't go down
// NOTE(review): the loop body is not visible in this listing. CountDownLatch.getCount() returns
// the remaining count, not the number of waiting threads, so with the initial count of 1 this
// condition is false until the latch has already been counted down - confirm that this really
// implements the intent described in the comment above.
149 while (this.latch
.getCount() <= 0)
151 this.latch
.countDown();
// Wraps the Writer created by the superclass in an anonymous Writer. sync and append throw a
// fake IOException while throwException is true; getLength and getSyncedLength delegate to the
// wrapped writer. The body of close and the non-throwing paths are not visible in this listing.
156 protected Writer
createWriterInstance(Path path
) throws IOException
{
157 final Writer w
= super.createWriterInstance(path
);
158 return new Writer() {
160 public void close() throws IOException
{
165 public void sync(boolean forceSync
) throws IOException
{
166 if (throwException
) {
167 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
173 public void append(Entry entry
) throws IOException
{
174 if (throwException
) {
175 throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
181 public long getLength() {
182 return w
.getLength();
186 public long getSyncedLength() {
187 return w
.getSyncedLength();
194 * Reproduce locking up that happens when we get an inopportune sync during setup for
195 * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test time out because the WAL locks up.
197 * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
198 * set up a dodgy WAL that will throw an exception when we go to append to it.
// Reproduces the HBASE-14317 lock-up scenario with the static DodgyFSLog: a LogRoller and a
// region are attached to the dodgy WAL, the WAL is switched to throwing on append/sync, and the
// test then checks that the sync fails and that a different WAL file ends up current before
// closing the roller.
// NOTE(review): several lines (loop bodies, `try {` openers, closing braces) are not visible in
// this listing; the comments below describe only the statements that are.
201 public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException
{
202 // Mocked up server and regionserver services. Needed below.
203 RegionServerServices services
= Mockito
.mock(RegionServerServices
.class);
204 Mockito
.when(services
.getConfiguration()).thenReturn(CONF
);
205 Mockito
.when(services
.isStopped()).thenReturn(false);
206 Mockito
.when(services
.isAborted()).thenReturn(false);
208 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
209 FileSystem fs
= FileSystem
.get(CONF
);
210 Path rootDir
= new Path(dir
+ getName());
// The test method name is also passed as the WAL's logDir argument.
211 DodgyFSLog dodgyWAL
= new DodgyFSLog(fs
, rootDir
, getName(), CONF
);
// Remember the WAL file in use now so it can be compared with the current file after the
// failures below.
213 Path originalWAL
= dodgyWAL
.getCurrentFileName();
214 // I need a log roller running.
215 LogRoller logRoller
= new LogRoller(services
);
216 logRoller
.addWAL(dodgyWAL
);
217 // There is no 'stop' once a logRoller is running.. it just dies.
219 // Now get a region and start adding in edits.
220 final HRegion region
= initHRegion(tableName
, null, null, CONF
, dodgyWAL
);
221 byte [] bytes
= Bytes
.toBytes(getName());
// Scopes map handed to the WAL key below; it holds a single entry for the test column family.
222 NavigableMap
<byte[], Integer
> scopes
= new TreeMap
<>(
223 Bytes
.BYTES_COMPARATOR
);
224 scopes
.put(COLUMN_FAMILY_BYTES
, 0);
225 MultiVersionConcurrencyControl mvcc
= new MultiVersionConcurrencyControl();
227 // First get something into memstore. Make a Put and then pull the Cell out of it. Will
228 // manage append and sync carefully in below to manufacture hang. We keep adding same
229 // edit. WAL subsystem doesn't care.
230 Put put
= new Put(bytes
);
231 put
.addColumn(COLUMN_FAMILY_BYTES
, Bytes
.toBytes("1"), bytes
);
// NOTE(review): the key is built with TableName.META_TABLE_NAME rather than this test's
// `tableName` - confirm that is intended.
232 WALKeyImpl key
= new WALKeyImpl(region
.getRegionInfo().getEncodedNameAsBytes(),
233 TableName
.META_TABLE_NAME
, EnvironmentEdgeManager
.currentTime(), mvcc
, scopes
);
234 WALEdit edit
= new WALEdit();
// NOTE(review): the local variable shares its name with the CellScanner type; a lowerCamelCase
// name would be conventional.
235 CellScanner CellScanner
= put
.cellScanner();
236 assertTrue(CellScanner
.advance());
237 edit
.add(CellScanner
.current());
238 // Put something in memstore and out in the WAL. Do a big number of appends so we push
239 // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
// (The body of this loop is not visible in this listing.)
240 for (int i
= 0; i
< 1000; i
++) {
243 // Set it so we start throwing exceptions.
244 LOG
.info("SET throwing of exception on append");
245 dodgyWAL
.throwException
= true;
246 // This append provokes a WAL roll request
247 dodgyWAL
.appendData(region
.getRegionInfo(), key
, edit
);
// The sync below is expected to fail; `exception` is presumably set to true in the catch block,
// whose body is not visible here - confirm.
248 boolean exception
= false;
250 dodgyWAL
.sync(false);
251 } catch (Exception e
) {
254 assertTrue("Did not get sync exception", exception
);
256 // Get a memstore flush going too so we have same hung profile as up in the issue over
257 // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
258 // by the zigzaglatch waiting on syncs to come home.
259 Thread t
= new Thread ("Flusher") {
// Inside the flusher: an IOException is raised when the region reports no memstore data, and any
// IOException caught here is only logged.
263 if (region
.getMemStoreDataSize() <= 0) {
264 throw new IOException("memstore size=" + region
.getMemStoreDataSize());
267 } catch (IOException e
) {
268 // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
269 // when roll completes.
270 LOG
.info("In flush", e
);
// Loop while the dodgy WAL's latch has not yet been counted down (loop body not visible here).
278 while (dodgyWAL
.latch
.getCount() > 0) {
281 // Now assert I got a new WAL file put in place even though loads of errors above.
// NOTE(review): `!=` compares Path references, not values, so this would also pass if an equal
// but distinct Path instance were returned; !originalWAL.equals(...) would check the value.
282 assertTrue(originalWAL
!= dodgyWAL
.getCurrentFileName());
283 // Can I append to it?
284 dodgyWAL
.throwException
= false;
// A failure of the follow-up put (not visible in this listing) is only logged.
287 } catch (Exception e
) {
288 LOG
.info("In the put", e
);
291 // To stop logRoller, its server has to say it is stopped.
292 Mockito
.when(services
.isStopped()).thenReturn(true);
293 Closeables
.close(logRoller
, true);
// Cleanup of the region and the WAL (the calls themselves are not visible here); any Exception
// raised on the way out is only logged.
295 if (region
!= null) {
298 if (dodgyWAL
!= null) {
301 } catch (Exception e
) {
302 LOG
.info("On way out", e
);
309 * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in
310 * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317.
// Uses a method-local DodgyFSLog whose Writer always fails sync. A roll is requested, the test
// waits until the zigzag latch has been created, publishes a sync from a separate thread, and
// then expects a direct sync(false) call to fail with an IOException rather than hang.
// NOTE(review): several lines (`try {` openers, loop bodies, closing braces) are not visible in
// this listing; the comments below describe only the statements that are.
313 public void testRingBufferEventHandlerStuckWhenSyncFailed()
314 throws IOException
, InterruptedException
{
316 // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down
317 class DodgyFSLog
extends FSHLog
{
// Set to true by afterCreatingZigZagLatch; the test polls it before sending its sync.
319 private volatile boolean zigZagCreated
= false;
// Passes all four arguments straight through to the FSHLog constructor.
321 public DodgyFSLog(FileSystem fs
, Path root
, String logDir
, Configuration conf
)
323 super(fs
, root
, logDir
, conf
);
// Records that the zigzag latch now exists; the sleep described below is not visible here.
327 protected void afterCreatingZigZagLatch() {
328 zigZagCreated
= true;
329 // Sleep a while to wait for RingBufferEventHandler to get stuck first.
332 } catch (InterruptedException ignore
) {
// Delegates to the superclass; presumably redeclared so the enclosing test can call it - confirm.
337 protected long getSequenceOnRingBuffer() {
338 return super.getSequenceOnRingBuffer();
// Publishes a sync for `sequence` on the ring buffer and blocks on it; an IOException is the
// expected outcome, so reaching Assert.fail means the sync unexpectedly succeeded.
341 protected void publishSyncOnRingBufferAndBlock(long sequence
) {
343 super.blockOnSync(super.publishSyncOnRingBuffer(sequence
, false));
344 Assert
.fail("Expect an IOException here.");
345 } catch (IOException ignore
) {
346 // Here, we will get an IOException.
// Wraps the Writer created by the superclass in an anonymous Writer whose sync always throws a
// fake IOException; getLength and getSyncedLength delegate to the wrapped writer. The bodies of
// close and append are not visible in this listing.
351 protected Writer
createWriterInstance(Path path
) throws IOException
{
352 final Writer w
= super.createWriterInstance(path
);
353 return new Writer() {
355 public void close() throws IOException
{
360 public void sync(boolean forceSync
) throws IOException
{
361 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
365 public void append(Entry entry
) throws IOException
{
370 public long getLength() {
371 return w
.getLength();
375 public long getSyncedLength() {
376 return w
.getSyncedLength();
382 // Mocked up server and regionserver services. Needed below.
383 RegionServerServices services
= Mockito
.mock(RegionServerServices
.class);
384 Mockito
.when(services
.getConfiguration()).thenReturn(CONF
);
385 Mockito
.when(services
.isStopped()).thenReturn(false);
386 Mockito
.when(services
.isAborted()).thenReturn(false);
388 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
389 FileSystem fs
= FileSystem
.get(CONF
);
390 Path rootDir
= new Path(dir
+ getName());
// The test method name is also passed as the WAL's logDir argument.
391 final DodgyFSLog dodgyWAL
= new DodgyFSLog(fs
, rootDir
, getName(), CONF
);
392 // I need a log roller running.
393 LogRoller logRoller
= new LogRoller(services
);
394 logRoller
.addWAL(dodgyWAL
);
395 // There is no 'stop' once a logRoller is running.. it just dies.
// Capture a ring buffer sequence now; the sync thread below publishes its sync with it.
399 final long seqForSync
= dodgyWAL
.getSequenceOnRingBuffer();
401 // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch
403 // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event,
404 // as long as HBASE-14317 hasn't been fixed.
405 LOG
.info("Trigger log roll for creating a ZigZagLatch.");
406 logRoller
.requestRollAll();
// Loop until afterCreatingZigZagLatch has flagged that the latch exists (loop body not visible).
408 while (!dodgyWAL
.zigZagCreated
) {
412 // Send a sync event for RingBufferEventHandler,
413 // and it gets blocked in RingBufferEventHandler.attainSafePoint
414 LOG
.info("Send sync for RingBufferEventHandler");
415 Thread syncThread
= new Thread() {
418 dodgyWAL
.publishSyncOnRingBufferAndBlock(seqForSync
);
421 // Sync in another thread to avoid resetting the SyncFuture again.
426 LOG
.info("Call sync for testing whether RingBufferEventHandler is hanging.");
427 dodgyWAL
.sync(false); // Should not get a hang here, otherwise we will see timeout in this test.
428 Assert
.fail("Expect an IOException here.");
429 } catch (IOException ignore
) {
433 // To stop logRoller, its server has to say it is stopped.
434 Mockito
.when(services
.isStopped()).thenReturn(true);
// Cleanup of the roller and the WAL; the calls inside these guards are not visible here.
435 if (logRoller
!= null) {
438 if (dodgyWAL
!= null) {
445 * @return A region on which you must call {@link HBaseTestingUtil#closeRegionAndWAL(HRegion)}
// Initialises ChunkCreator using the MemStoreLAB default chunk size and default index-chunk
// percentage, then returns a local HRegion created by TEST_UTIL for the given table and
// start/stop keys, using `conf`, Durability.SYNC_WAL, the supplied `wal` and the
// COLUMN_FAMILY_BYTES family.
448 private static HRegion
initHRegion(TableName tableName
, byte[] startKey
, byte[] stopKey
,
449 Configuration conf
, WAL wal
) throws IOException
{
450 ChunkCreator
.initialize(MemStoreLAB
.CHUNK_SIZE_DEFAULT
, false, 0, 0,
451 0, null, MemStoreLAB
.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT
);
452 return TEST_UTIL
.createLocalHRegion(tableName
, startKey
, stopKey
, conf
, false,
453 Durability
.SYNC_WAL
, wal
, COLUMN_FAMILY_BYTES
);