From c503f435045aeb27ea33ec9915da7dfa3ea79712 Mon Sep 17 00:00:00 2001 From: Dan Cech Date: Thu, 1 Feb 2018 16:52:13 -0500 Subject: [PATCH 1/5] support for storing tagged series in hashed filenames --- conf/carbon.conf.example | 3 ++ lib/carbon/conf.py | 1 + lib/carbon/database.py | 23 +++++++++----- lib/carbon/tests/test_database.py | 50 ++++++++++++++++++++++++------- lib/carbon/util.py | 13 ++++++-- 5 files changed, 71 insertions(+), 19 deletions(-) diff --git a/conf/carbon.conf.example b/conf/carbon.conf.example index 456233be9..1a820b78f 100644 --- a/conf/carbon.conf.example +++ b/conf/carbon.conf.example @@ -312,6 +312,9 @@ WHISPER_FALLOCATE_CREATE = True # an update to the tag index, the default setting is once every 100 updates # TAG_UPDATE_INTERVAL = 100 +# Tag hash filenames, this specifies whether tagged metric filenames should use the hash of the metric name +# or a human-readable name, using hashed names avoids issues with path length when using a large number of tags +# TAG_HASH_FILENAMES = True # Tag batch size, this specifies the maximum number of series to be sent to graphite-web in a single batch # TAG_BATCH_SIZE = 100 diff --git a/lib/carbon/conf.py b/lib/carbon/conf.py index 92062dd4b..560ec66c9 100644 --- a/lib/carbon/conf.py +++ b/lib/carbon/conf.py @@ -81,6 +81,7 @@ TAG_UPDATE_INTERVAL=100, TAG_BATCH_SIZE=100, TAG_QUEUE_SIZE=10000, + TAG_HASH_FILENAMES=True, ENABLE_MANHOLE=False, MANHOLE_INTERFACE='127.0.0.1', MANHOLE_PORT=7222, diff --git a/lib/carbon/database.py b/lib/carbon/database.py index 839ad8753..efdecd062 100644 --- a/lib/carbon/database.py +++ b/lib/carbon/database.py @@ -88,6 +88,7 @@ def __init__(self, settings): super(WhisperDatabase, self).__init__(settings) self.data_dir = settings.LOCAL_DATA_DIR + self.tag_hash_filenames = settings.TAG_HASH_FILENAMES self.sparse_create = settings.WHISPER_SPARSE_CREATE self.fallocate_create = settings.WHISPER_FALLOCATE_CREATE if settings.WHISPER_AUTOFLUSH: @@ -152,7 +153,10 @@ def setMetadata(self, metric, key, value): return whisper.setAggregationMethod(wsp_path, value) def getFilesystemPath(self, metric): - return join(self.data_dir, TaggedSeries.encode(metric, sep) + '.wsp') + return join( + self.data_dir, + TaggedSeries.encode(metric, sep, hash_only=self.tag_hash_filenames) + '.wsp' + ) def validateArchiveList(self, archiveList): try: @@ -174,6 +178,7 @@ def __init__(self, settings): super(CeresDatabase, self).__init__(settings) self.data_dir = settings.LOCAL_DATA_DIR + self.tag_hash_filenames = settings.TAG_HASH_FILENAMES ceres.setDefaultNodeCachingBehavior(settings.CERES_NODE_CACHING_BEHAVIOR) ceres.setDefaultSliceCachingBehavior(settings.CERES_SLICE_CACHING_BEHAVIOR) ceres.MAX_SLICE_GAP = int(settings.CERES_MAX_SLICE_GAP) @@ -187,26 +192,30 @@ def __init__(self, settings): self.tree = ceres.CeresTree(self.data_dir) + def encode(self, metric): + return TaggedSeries.encode(metric, hash_only=self.tag_hash_filenames) + def write(self, metric, datapoints): - self.tree.store(TaggedSeries.encode(metric), datapoints) + self.tree.store(self.encode(metric), datapoints) def exists(self, metric): - return self.tree.hasNode(TaggedSeries.encode(metric)) + return self.tree.hasNode(self.encode(metric)) def create(self, metric, retentions, xfilesfactor, aggregation_method): - self.tree.createNode(TaggedSeries.encode(metric), retentions=retentions, + self.tree.createNode(self.encode(metric), + retentions=retentions, timeStep=retentions[0][0], xFilesFactor=xfilesfactor, aggregationMethod=aggregation_method) def getMetadata(self, metric, key): - return self.tree.getNode(TaggedSeries.encode(metric)).readMetadata()[key] + return self.tree.getNode(self.encode(metric)).readMetadata()[key] def setMetadata(self, metric, key, value): - node = self.tree.getNode(TaggedSeries.encode(metric)) + node = self.tree.getNode(self.encode(metric)) metadata = node.readMetadata() metadata[key] = value node.writeMetadata(metadata) def getFilesystemPath(self, metric): - return self.tree.getFilesystemPath(TaggedSeries.encode(metric)) + return self.tree.getFilesystemPath(self.encode(metric)) diff --git a/lib/carbon/tests/test_database.py b/lib/carbon/tests/test_database.py index 82b4c4757..be96f09ab 100644 --- a/lib/carbon/tests/test_database.py +++ b/lib/carbon/tests/test_database.py @@ -11,40 +11,70 @@ class WhisperDatabaseTest(TestCase): def setUp(self): self._sep_patch = patch.object(os.path, 'sep', "/") self._sep_patch.start() - settings = TestSettings() - settings['LOCAL_DATA_DIR'] = '/tmp/' - self.database = WhisperDatabase(settings) def tearDown(self): self._sep_patch.stop() def test_getFilesystemPath(self): - result = self.database.getFilesystemPath('stats.example.counts') + settings = TestSettings() + settings['LOCAL_DATA_DIR'] = '/tmp/' + database = WhisperDatabase(settings) + result = database.getFilesystemPath('stats.example.counts') self.assertEqual(result, '/tmp/stats/example/counts.wsp') # nosec def test_getTaggedFilesystemPath(self): - result = self.database.getFilesystemPath('stats.example.counts;tag1=value1') + settings = TestSettings() + settings['LOCAL_DATA_DIR'] = '/tmp/' + settings['TAG_HASH_FILENAMES'] = False + database = WhisperDatabase(settings) + result = database.getFilesystemPath('stats.example.counts;tag1=value1') self.assertEqual( result, '/tmp/_tagged/872/252/stats_DOT_example_DOT_counts;tag1=value1.wsp') # nosec + def test_getTaggedFilesystemPathHashed(self): + settings = TestSettings() + settings['LOCAL_DATA_DIR'] = '/tmp/' + settings['TAG_HASH_FILENAMES'] = True + database = WhisperDatabase(settings) + result = database.getFilesystemPath('stats.example.counts;tag1=value1') + self.assertEqual( + result, + '/tmp/_tagged/872/252/' + + '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0.wsp') # nosec + class CeresDatabaseTest(TestCase): def setUp(self): self._sep_patch = patch.object(os.path, 'sep', "/") self._sep_patch.start() - settings = TestSettings() - settings['LOCAL_DATA_DIR'] = '/tmp/' - self.database = CeresDatabase(settings) def tearDown(self): self._sep_patch.stop() def test_getFilesystemPath(self): - result = self.database.getFilesystemPath('stats.example.counts') + settings = TestSettings() + settings['LOCAL_DATA_DIR'] = '/tmp/' + database = CeresDatabase(settings) + result = database.getFilesystemPath('stats.example.counts') self.assertEqual(result, '/tmp/stats/example/counts') # nosec def test_getTaggedFilesystemPath(self): - result = self.database.getFilesystemPath('stats.example.counts;tag1=value1') + settings = TestSettings() + settings['LOCAL_DATA_DIR'] = '/tmp/' + settings['TAG_HASH_FILENAMES'] = False + database = CeresDatabase(settings) + result = database.getFilesystemPath('stats.example.counts;tag1=value1') self.assertEqual( result, '/tmp/_tagged/872/252/stats_DOT_example_DOT_counts;tag1=value1') # nosec + + def test_getTaggedFilesystemPathHashed(self): + settings = TestSettings() + settings['LOCAL_DATA_DIR'] = '/tmp/' + settings['TAG_HASH_FILENAMES'] = True + database = CeresDatabase(settings) + result = database.getFilesystemPath('stats.example.counts;tag1=value1') + self.assertEqual( + result, + '/tmp/_tagged/872/252/' + + '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0') # nosec diff --git a/lib/carbon/util.py b/lib/carbon/util.py index 4795ba7f9..36eabb03d 100644 --- a/lib/carbon/util.py +++ b/lib/carbon/util.py @@ -390,7 +390,7 @@ def format(tags): ])) @staticmethod - def encode(metric, sep='.'): + def encode(metric, sep='.', hash_only=False): """ Helper function to encode tagged series for storage in whisper etc @@ -404,6 +404,10 @@ def encode(metric, sep='.'): each carbon database and graphite-web finder is responsible for handling its own encoding so that different backends can create their own schemes if desired. + The hash_only parameter can be set to True to use the hash as the filename instead of a + human-readable name. This avoids issues with filename length restrictions, at the expense of + being unable to decode the filename and determine the original metric name. + A concrete example: .. code-block:: none @@ -418,7 +422,12 @@ def encode(metric, sep='.'): """ if ';' in metric: metric_hash = sha256(metric.encode('utf8')).hexdigest() - return sep.join(['_tagged', metric_hash[0:3], metric_hash[3:6], metric.replace('.', '_DOT_')]) + return sep.join([ + '_tagged', + metric_hash[0:3], + metric_hash[3:6], + metric_hash if hash_only else metric.replace('.', '_DOT_') + ]) # metric isn't tagged, just replace dots with the separator and trim any leading separator return metric.replace('.', sep).lstrip(sep) From 798f65c07ac5a598aa4ef3cbdc04dd23f762af87 Mon Sep 17 00:00:00 2001 From: Dan Cech Date: Thu, 1 Feb 2018 17:05:50 -0500 Subject: [PATCH 2/5] codacy fix --- lib/carbon/tests/test_database.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/carbon/tests/test_database.py b/lib/carbon/tests/test_database.py index be96f09ab..0c97656e1 100644 --- a/lib/carbon/tests/test_database.py +++ b/lib/carbon/tests/test_database.py @@ -39,8 +39,8 @@ def test_getTaggedFilesystemPathHashed(self): result = database.getFilesystemPath('stats.example.counts;tag1=value1') self.assertEqual( result, - '/tmp/_tagged/872/252/' + - '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0.wsp') # nosec + '/tmp/_tagged/872/252/' + # nosec + '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0.wsp') class CeresDatabaseTest(TestCase): @@ -76,5 +76,5 @@ def test_getTaggedFilesystemPathHashed(self): result = database.getFilesystemPath('stats.example.counts;tag1=value1') self.assertEqual( result, - '/tmp/_tagged/872/252/' + - '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0') # nosec + '/tmp/_tagged/872/252/' + # nosec + '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0') From fa330285749ac87a82618d07aee8c9b878eebc46 Mon Sep 17 00:00:00 2001 From: Dan Cech Date: Wed, 14 Feb 2018 12:00:20 -0500 Subject: [PATCH 3/5] move any existing non-hashed files to hashed names if needed --- lib/carbon/database.py | 32 ++++++-- lib/carbon/tests/test_database.py | 120 +++++++++++++++++++++++++++++- 2 files changed, 140 insertions(+), 12 deletions(-) diff --git a/lib/carbon/database.py b/lib/carbon/database.py index efdecd062..100bf832d 100644 --- a/lib/carbon/database.py +++ b/lib/carbon/database.py @@ -124,7 +124,13 @@ def write(self, metric, datapoints): whisper.update_many(path, datapoints) def exists(self, metric): - return exists(self.getFilesystemPath(metric)) + if exists(self.getFilesystemPath(metric)): + return True + # if we're using hashed filenames and a non-hashed file exists then move it to the new name + if self.tag_hash_filenames and exists(self.getFilesystemPath(metric, False)): + os.rename(self.getFilesystemPath(metric, False), self.getFilesystemPath(metric)) + return True + return False def create(self, metric, retentions, xfilesfactor, aggregation_method): path = self.getFilesystemPath(metric) @@ -152,10 +158,12 @@ def setMetadata(self, metric, key, value): wsp_path = self.getFilesystemPath(metric) return whisper.setAggregationMethod(wsp_path, value) - def getFilesystemPath(self, metric): + def getFilesystemPath(self, metric, tag_hash_filenames=None): + if tag_hash_filenames is None: + tag_hash_filenames = self.tag_hash_filenames return join( self.data_dir, - TaggedSeries.encode(metric, sep, hash_only=self.tag_hash_filenames) + '.wsp' + TaggedSeries.encode(metric, sep, hash_only=tag_hash_filenames) + '.wsp' ) def validateArchiveList(self, archiveList): @@ -192,14 +200,22 @@ def __init__(self, settings): self.tree = ceres.CeresTree(self.data_dir) - def encode(self, metric): - return TaggedSeries.encode(metric, hash_only=self.tag_hash_filenames) + def encode(self, metric, tag_hash_filenames=None): + if tag_hash_filenames is None: + tag_hash_filenames = self.tag_hash_filenames + return TaggedSeries.encode(metric, hash_only=tag_hash_filenames) def write(self, metric, datapoints): self.tree.store(self.encode(metric), datapoints) def exists(self, metric): - return self.tree.hasNode(self.encode(metric)) + if self.tree.hasNode(self.encode(metric)): + return True + # if we're using hashed filenames and a non-hashed file exists then move it to the new name + if self.tag_hash_filenames and self.tree.hasNode(self.encode(metric, False)): + os.rename(self.getFilesystemPath(metric, False), self.getFilesystemPath(metric)) + return True + return False def create(self, metric, retentions, xfilesfactor, aggregation_method): self.tree.createNode(self.encode(metric), @@ -217,5 +233,5 @@ def setMetadata(self, metric, key, value): metadata[key] = value node.writeMetadata(metadata) - def getFilesystemPath(self, metric): - return self.tree.getFilesystemPath(self.encode(metric)) + def getFilesystemPath(self, metric, tag_hash_filenames=None): + return self.tree.getFilesystemPath(self.encode(metric, tag_hash_filenames)) diff --git a/lib/carbon/tests/test_database.py b/lib/carbon/tests/test_database.py index 0c97656e1..f5db0997c 100644 --- a/lib/carbon/tests/test_database.py +++ b/lib/carbon/tests/test_database.py @@ -1,6 +1,8 @@ import os from unittest import TestCase from mock import patch +from os.path import exists +import shutil from carbon.tests.util import TestSettings from carbon.database import WhisperDatabase, CeresDatabase @@ -23,25 +25,80 @@ def test_getFilesystemPath(self): self.assertEqual(result, '/tmp/stats/example/counts.wsp') # nosec def test_getTaggedFilesystemPath(self): + metric = 'stats.example.counts;tag1=value1' + settings = TestSettings() settings['LOCAL_DATA_DIR'] = '/tmp/' settings['TAG_HASH_FILENAMES'] = False database = WhisperDatabase(settings) - result = database.getFilesystemPath('stats.example.counts;tag1=value1') + + result = database.getFilesystemPath(metric) self.assertEqual( result, '/tmp/_tagged/872/252/stats_DOT_example_DOT_counts;tag1=value1.wsp') # nosec + result = database.exists(metric) + self.assertEqual(result, False) + def test_getTaggedFilesystemPathHashed(self): + metric = 'stats.example.counts;tag1=value1' + settings = TestSettings() settings['LOCAL_DATA_DIR'] = '/tmp/' settings['TAG_HASH_FILENAMES'] = True database = WhisperDatabase(settings) - result = database.getFilesystemPath('stats.example.counts;tag1=value1') + + result = database.getFilesystemPath(metric) self.assertEqual( result, '/tmp/_tagged/872/252/' + # nosec '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0.wsp') + result = database.exists(metric) + self.assertEqual(result, False) + + def test_migrateTaggedFilesystemPathHashed(self): + metric = 'stats.example.counts;tag1=value1' + + settings = TestSettings() + settings['LOCAL_DATA_DIR'] = '/tmp/' + settings['TAG_HASH_FILENAMES'] = False + database = WhisperDatabase(settings) + + result = database.exists(metric) + self.assertEqual(result, False) + + old_path = database.getFilesystemPath(metric) + self.assertEqual( + old_path, '/tmp/_tagged/872/252/stats_DOT_example_DOT_counts;tag1=value1.wsp') # nosec + + self.assertEqual(exists(old_path), False) + + result = database.create(metric, [(60, 60)], 0.5, 'average') + + self.assertEqual(exists(old_path), True) + + result = database.exists(metric) + self.assertEqual(result, True) + + settings['TAG_HASH_FILENAMES'] = True + database = WhisperDatabase(settings) + + hashed_path = database.getFilesystemPath(metric) + self.assertEqual( + hashed_path, + '/tmp/_tagged/872/252/' + # nosec + '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0.wsp') + + self.assertEqual(exists(hashed_path), False) + + result = database.exists(metric) + self.assertEqual(result, True) + + self.assertEqual(exists(old_path), False) + self.assertEqual(exists(hashed_path), True) + + os.remove(hashed_path) + class CeresDatabaseTest(TestCase): @@ -60,21 +117,76 @@ def test_getFilesystemPath(self): self.assertEqual(result, '/tmp/stats/example/counts') # nosec def test_getTaggedFilesystemPath(self): + metric = 'stats.example.counts;tag1=value1' + settings = TestSettings() settings['LOCAL_DATA_DIR'] = '/tmp/' settings['TAG_HASH_FILENAMES'] = False database = CeresDatabase(settings) - result = database.getFilesystemPath('stats.example.counts;tag1=value1') + + result = database.getFilesystemPath(metric) self.assertEqual( result, '/tmp/_tagged/872/252/stats_DOT_example_DOT_counts;tag1=value1') # nosec + result = database.exists(metric) + self.assertEqual(result, False) + def test_getTaggedFilesystemPathHashed(self): + metric = 'stats.example.counts;tag1=value1' + settings = TestSettings() settings['LOCAL_DATA_DIR'] = '/tmp/' settings['TAG_HASH_FILENAMES'] = True database = CeresDatabase(settings) - result = database.getFilesystemPath('stats.example.counts;tag1=value1') + + result = database.getFilesystemPath(metric) self.assertEqual( result, '/tmp/_tagged/872/252/' + # nosec '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0') + + result = database.exists(metric) + self.assertEqual(result, False) + + def test_migrateTaggedFilesystemPathHashed(self): + metric = 'stats.example.counts;tag1=value1' + + settings = TestSettings() + settings['LOCAL_DATA_DIR'] = '/tmp/' + settings['TAG_HASH_FILENAMES'] = False + database = CeresDatabase(settings) + + result = database.exists(metric) + self.assertEqual(result, False) + + old_path = database.getFilesystemPath(metric) + self.assertEqual( + old_path, '/tmp/_tagged/872/252/stats_DOT_example_DOT_counts;tag1=value1') # nosec + + self.assertEqual(exists(old_path), False) + + result = database.create(metric, [(60, 60)], 0.5, 'average') + + self.assertEqual(exists(old_path), True) + + result = database.exists(metric) + self.assertEqual(result, True) + + settings['TAG_HASH_FILENAMES'] = True + database = CeresDatabase(settings) + + hashed_path = database.getFilesystemPath(metric) + self.assertEqual( + hashed_path, + '/tmp/_tagged/872/252/' + # nosec + '872252dcead671982862f82a3b440f02aa8f525dd6d0f2921de0dc2b3e874ad0') + + self.assertEqual(exists(hashed_path), False) + + result = database.exists(metric) + self.assertEqual(result, True) + + self.assertEqual(exists(old_path), False) + self.assertEqual(exists(hashed_path), True) + + shutil.rmtree(hashed_path) From 6fb057ac97e79c669e4096744f28de97592a3731 Mon Sep 17 00:00:00 2001 From: Dan Cech Date: Wed, 14 Feb 2018 12:12:47 -0500 Subject: [PATCH 4/5] codacy fix --- lib/carbon/database.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/lib/carbon/database.py b/lib/carbon/database.py index 100bf832d..13a5ddad6 100644 --- a/lib/carbon/database.py +++ b/lib/carbon/database.py @@ -127,8 +127,8 @@ def exists(self, metric): if exists(self.getFilesystemPath(metric)): return True # if we're using hashed filenames and a non-hashed file exists then move it to the new name - if self.tag_hash_filenames and exists(self.getFilesystemPath(metric, False)): - os.rename(self.getFilesystemPath(metric, False), self.getFilesystemPath(metric)) + if self.tag_hash_filenames and exists(self._getFilesystemPath(metric, False)): + os.rename(self._getFilesystemPath(metric, False), self.getFilesystemPath(metric)) return True return False @@ -158,9 +158,10 @@ def setMetadata(self, metric, key, value): wsp_path = self.getFilesystemPath(metric) return whisper.setAggregationMethod(wsp_path, value) - def getFilesystemPath(self, metric, tag_hash_filenames=None): - if tag_hash_filenames is None: - tag_hash_filenames = self.tag_hash_filenames + def getFilesystemPath(self, metric): + return self._getFilesystemPath(metric, self.tag_hash_filenames) + + def _getFilesystemPath(self, metric, tag_hash_filenames): return join( self.data_dir, TaggedSeries.encode(metric, sep, hash_only=tag_hash_filenames) + '.wsp' @@ -213,7 +214,7 @@ def exists(self, metric): return True # if we're using hashed filenames and a non-hashed file exists then move it to the new name if self.tag_hash_filenames and self.tree.hasNode(self.encode(metric, False)): - os.rename(self.getFilesystemPath(metric, False), self.getFilesystemPath(metric)) + os.rename(self._getFilesystemPath(metric, False), self.getFilesystemPath(metric)) return True return False @@ -233,5 +234,8 @@ def setMetadata(self, metric, key, value): metadata[key] = value node.writeMetadata(metadata) - def getFilesystemPath(self, metric, tag_hash_filenames=None): + def getFilesystemPath(self, metric): + return self._getFilesystemPath(metric, self.tag_hash_filenames) + + def _getFilesystemPath(self, metric, tag_hash_filenames): return self.tree.getFilesystemPath(self.encode(metric, tag_hash_filenames)) From 0ae97942bd7bfe6dd96ceda1952439cd7bb9fc42 Mon Sep 17 00:00:00 2001 From: Denys Zhdanov Date: Sun, 11 Mar 2018 21:32:34 +0100 Subject: [PATCH 5/5] Adding dummy.txt to storage dirs --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index ceccf6974..74e89f9d7 100644 --- a/setup.py +++ b/setup.py @@ -58,9 +58,9 @@ setup_kwargs = dict() -storage_dirs = [ ('storage/ceres', []), ('storage/whisper',[]), - ('storage/lists',[]), ('storage/log',[]), - ('storage/rrd',[]) ] +storage_dirs = [ ('storage/ceres/dummy.txt', []), ('storage/whisper/dummy.txt',[]), + ('storage/lists',[]), ('storage/log/dummy.txt',[]), + ('storage/rrd/dummy.txt',[]) ] conf_files = [ ('conf', glob('conf/*.example')) ] install_files = storage_dirs + conf_files