HBASE-26567 Remove IndexType from ChunkCreator (#3947)
[hbase.git] / hbase-server / src / test / java / org / apache / hadoop / hbase / regionserver / TestWALLockup.java
blob93f06fa318572945450831093b555aceaee76560
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org.apache.hadoop.hbase.regionserver;
20 import static org.junit.Assert.assertTrue;
22 import java.io.IOException;
23 import java.util.NavigableMap;
24 import java.util.TreeMap;
25 import java.util.concurrent.CountDownLatch;
26 import java.util.concurrent.TimeUnit;
28 import org.apache.hadoop.conf.Configuration;
29 import org.apache.hadoop.fs.FileSystem;
30 import org.apache.hadoop.fs.Path;
31 import org.apache.hadoop.hbase.CellScanner;
32 import org.apache.hadoop.hbase.HBaseClassTestRule;
33 import org.apache.hadoop.hbase.HBaseTestingUtil;
34 import org.apache.hadoop.hbase.HConstants;
35 import org.apache.hadoop.hbase.TableName;
36 import org.apache.hadoop.hbase.client.Durability;
37 import org.apache.hadoop.hbase.client.Put;
38 import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
39 import org.apache.hadoop.hbase.testclassification.MediumTests;
40 import org.apache.hadoop.hbase.testclassification.RegionServerTests;
41 import org.apache.hadoop.hbase.util.Bytes;
42 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
43 import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
44 import org.apache.hadoop.hbase.util.Threads;
45 import org.apache.hadoop.hbase.wal.WAL;
46 import org.apache.hadoop.hbase.wal.WALEdit;
47 import org.apache.hadoop.hbase.wal.WALKeyImpl;
48 import org.apache.hadoop.hbase.wal.WALProvider.Writer;
49 import org.junit.After;
50 import org.junit.Assert;
51 import org.junit.Before;
52 import org.junit.ClassRule;
53 import org.junit.Rule;
54 import org.junit.Test;
55 import org.junit.experimental.categories.Category;
56 import org.junit.rules.TestName;
57 import org.mockito.Mockito;
58 import org.slf4j.Logger;
59 import org.slf4j.LoggerFactory;
61 import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
63 /**
64 * Testing for lock up of FSHLog.
66 @Category({ RegionServerTests.class, MediumTests.class })
67 public class TestWALLockup {
69 @ClassRule
70 public static final HBaseClassTestRule CLASS_RULE =
71 HBaseClassTestRule.forClass(TestWALLockup.class);
73 private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class);
75 @Rule
76 public TestName name = new TestName();
78 private static final String COLUMN_FAMILY = "MyCF";
79 private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
81 private static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
82 private static Configuration CONF ;
83 private String dir;
85 // Test names
86 protected TableName tableName;
88 @Before
89 public void setup() throws IOException {
90 CONF = TEST_UTIL.getConfiguration();
91 // Disable block cache.
92 CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
93 dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
94 tableName = TableName.valueOf(name.getMethodName());
97 @After
98 public void tearDown() throws Exception {
99 EnvironmentEdgeManagerTestHelper.reset();
100 LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
101 TEST_UTIL.cleanupTestDir();
104 private String getName() {
105 return name.getMethodName();
108 // A WAL that we can have throw exceptions when a flag is set.
109 private static final class DodgyFSLog extends FSHLog {
110 // Set this when want the WAL to start throwing exceptions.
111 volatile boolean throwException = false;
113 // Latch to hold up processing until after another operation has had time to run.
114 CountDownLatch latch = new CountDownLatch(1);
116 public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
117 throws IOException {
118 super(fs, root, logDir, conf);
121 @Override
122 protected void afterCreatingZigZagLatch() {
123 // If throwException set, then append will throw an exception causing the WAL to be
124 // rolled. We'll come in here. Hold up processing until a sync can get in before
125 // the zigzag has time to complete its setup and get its own sync in. This is what causes
126 // the lock up we've seen in production.
127 if (throwException) {
128 try {
129 LOG.info("LATCHED");
130 // So, timing can have it that the test can run and the bad flush below happens
131 // before we get here. In this case, we'll be stuck waiting on this latch but there
132 // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
133 // because all WALs have rolled. In this case, just give up on test.
134 if (!this.latch.await(5, TimeUnit.SECONDS)) {
135 LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
137 } catch (InterruptedException e) {
142 @Override
143 protected void beforeWaitOnSafePoint() {
144 if (throwException) {
145 LOG.info("COUNTDOWN");
146 // Don't countdown latch until someone waiting on it otherwise, the above
147 // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
148 // be stuck; test won't go down
149 while (this.latch.getCount() <= 0)
150 Threads.sleep(1);
151 this.latch.countDown();
155 @Override
156 protected Writer createWriterInstance(Path path) throws IOException {
157 final Writer w = super.createWriterInstance(path);
158 return new Writer() {
159 @Override
160 public void close() throws IOException {
161 w.close();
164 @Override
165 public void sync(boolean forceSync) throws IOException {
166 if (throwException) {
167 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
169 w.sync(forceSync);
172 @Override
173 public void append(Entry entry) throws IOException {
174 if (throwException) {
175 throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
177 w.append(entry);
180 @Override
181 public long getLength() {
182 return w.getLength();
185 @Override
186 public long getSyncedLength() {
187 return w.getSyncedLength();
194 * Reproduce locking up that happens when we get an inopportune sync during setup for
195 * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
196 * it is locked up.
197 * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
198 * set up a dodgy WAL that will throw an exception when we go to append to it.
200 @Test
201 public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
202 // Mocked up server and regionserver services. Needed below.
203 RegionServerServices services = Mockito.mock(RegionServerServices.class);
204 Mockito.when(services.getConfiguration()).thenReturn(CONF);
205 Mockito.when(services.isStopped()).thenReturn(false);
206 Mockito.when(services.isAborted()).thenReturn(false);
208 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
209 FileSystem fs = FileSystem.get(CONF);
210 Path rootDir = new Path(dir + getName());
211 DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
212 dodgyWAL.init();
213 Path originalWAL = dodgyWAL.getCurrentFileName();
214 // I need a log roller running.
215 LogRoller logRoller = new LogRoller(services);
216 logRoller.addWAL(dodgyWAL);
217 // There is no 'stop' once a logRoller is running.. it just dies.
218 logRoller.start();
219 // Now get a region and start adding in edits.
220 final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
221 byte [] bytes = Bytes.toBytes(getName());
222 NavigableMap<byte[], Integer> scopes = new TreeMap<>(
223 Bytes.BYTES_COMPARATOR);
224 scopes.put(COLUMN_FAMILY_BYTES, 0);
225 MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
226 try {
227 // First get something into memstore. Make a Put and then pull the Cell out of it. Will
228 // manage append and sync carefully in below to manufacture hang. We keep adding same
229 // edit. WAL subsystem doesn't care.
230 Put put = new Put(bytes);
231 put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
232 WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(),
233 TableName.META_TABLE_NAME, EnvironmentEdgeManager.currentTime(), mvcc, scopes);
234 WALEdit edit = new WALEdit();
235 CellScanner CellScanner = put.cellScanner();
236 assertTrue(CellScanner.advance());
237 edit.add(CellScanner.current());
238 // Put something in memstore and out in the WAL. Do a big number of appends so we push
239 // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
240 for (int i = 0; i < 1000; i++) {
241 region.put(put);
243 // Set it so we start throwing exceptions.
244 LOG.info("SET throwing of exception on append");
245 dodgyWAL.throwException = true;
246 // This append provokes a WAL roll request
247 dodgyWAL.appendData(region.getRegionInfo(), key, edit);
248 boolean exception = false;
249 try {
250 dodgyWAL.sync(false);
251 } catch (Exception e) {
252 exception = true;
254 assertTrue("Did not get sync exception", exception);
256 // Get a memstore flush going too so we have same hung profile as up in the issue over
257 // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
258 // by the zigzaglatch waiting on syncs to come home.
259 Thread t = new Thread ("Flusher") {
260 @Override
261 public void run() {
262 try {
263 if (region.getMemStoreDataSize() <= 0) {
264 throw new IOException("memstore size=" + region.getMemStoreDataSize());
266 region.flush(false);
267 } catch (IOException e) {
268 // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
269 // when roll completes.
270 LOG.info("In flush", e);
272 LOG.info("Exiting");
275 t.setDaemon(true);
276 t.start();
277 // Wait until
278 while (dodgyWAL.latch.getCount() > 0) {
279 Threads.sleep(1);
281 // Now assert I got a new WAL file put in place even though loads of errors above.
282 assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
283 // Can I append to it?
284 dodgyWAL.throwException = false;
285 try {
286 region.put(put);
287 } catch (Exception e) {
288 LOG.info("In the put", e);
290 } finally {
291 // To stop logRoller, its server has to say it is stopped.
292 Mockito.when(services.isStopped()).thenReturn(true);
293 Closeables.close(logRoller, true);
294 try {
295 if (region != null) {
296 region.close();
298 if (dodgyWAL != null) {
299 dodgyWAL.close();
301 } catch (Exception e) {
302 LOG.info("On way out", e);
309 * If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in
310 * attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317.
312 @Test
313 public void testRingBufferEventHandlerStuckWhenSyncFailed()
314 throws IOException, InterruptedException {
316 // A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down
317 class DodgyFSLog extends FSHLog {
319 private volatile boolean zigZagCreated = false;
321 public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
322 throws IOException {
323 super(fs, root, logDir, conf);
326 @Override
327 protected void afterCreatingZigZagLatch() {
328 zigZagCreated = true;
329 // Sleep a while to wait for RingBufferEventHandler to get stuck first.
330 try {
331 Thread.sleep(3000);
332 } catch (InterruptedException ignore) {
336 @Override
337 protected long getSequenceOnRingBuffer() {
338 return super.getSequenceOnRingBuffer();
341 protected void publishSyncOnRingBufferAndBlock(long sequence) {
342 try {
343 super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false));
344 Assert.fail("Expect an IOException here.");
345 } catch (IOException ignore) {
346 // Here, we will get an IOException.
350 @Override
351 protected Writer createWriterInstance(Path path) throws IOException {
352 final Writer w = super.createWriterInstance(path);
353 return new Writer() {
354 @Override
355 public void close() throws IOException {
356 w.close();
359 @Override
360 public void sync(boolean forceSync) throws IOException {
361 throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
364 @Override
365 public void append(Entry entry) throws IOException {
366 w.append(entry);
369 @Override
370 public long getLength() {
371 return w.getLength();
374 @Override
375 public long getSyncedLength() {
376 return w.getSyncedLength();
382 // Mocked up server and regionserver services. Needed below.
383 RegionServerServices services = Mockito.mock(RegionServerServices.class);
384 Mockito.when(services.getConfiguration()).thenReturn(CONF);
385 Mockito.when(services.isStopped()).thenReturn(false);
386 Mockito.when(services.isAborted()).thenReturn(false);
388 // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
389 FileSystem fs = FileSystem.get(CONF);
390 Path rootDir = new Path(dir + getName());
391 final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
392 // I need a log roller running.
393 LogRoller logRoller = new LogRoller(services);
394 logRoller.addWAL(dodgyWAL);
395 // There is no 'stop' once a logRoller is running.. it just dies.
396 logRoller.start();
398 try {
399 final long seqForSync = dodgyWAL.getSequenceOnRingBuffer();
401 // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch
402 // in LogRoller.
403 // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event,
404 // as long as HBASE-14317 hasn't be fixed.
405 LOG.info("Trigger log roll for creating a ZigZagLatch.");
406 logRoller.requestRollAll();
408 while (!dodgyWAL.zigZagCreated) {
409 Thread.sleep(10);
412 // Send a sync event for RingBufferEventHandler,
413 // and it gets blocked in RingBufferEventHandler.attainSafePoint
414 LOG.info("Send sync for RingBufferEventHandler");
415 Thread syncThread = new Thread() {
416 @Override
417 public void run() {
418 dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync);
421 // Sync in another thread to avoid reset SyncFuture again.
422 syncThread.start();
423 syncThread.join();
425 try {
426 LOG.info("Call sync for testing whether RingBufferEventHandler is hanging.");
427 dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this test.
428 Assert.fail("Expect an IOException here.");
429 } catch (IOException ignore) {
432 } finally {
433 // To stop logRoller, its server has to say it is stopped.
434 Mockito.when(services.isStopped()).thenReturn(true);
435 if (logRoller != null) {
436 logRoller.close();
438 if (dodgyWAL != null) {
439 dodgyWAL.close();
445 * @return A region on which you must call {@link HBaseTestingUtil#closeRegionAndWAL(HRegion)}
446 * when done.
448 private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
449 Configuration conf, WAL wal) throws IOException {
450 ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0,
451 0, null, MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT);
452 return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false,
453 Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);