From a8b590ecbb8fde37a9ba9dfb2964c187b9948e9d Mon Sep 17 00:00:00 2001
From: Lex Berezhny
Date: Sat, 22 Jun 2019 20:11:33 -0400
Subject: [PATCH] add tag normalization on storage and on search

---
 lbry/lbry/schema/tags.py                      | 13 +++++++++++++
 lbry/lbry/wallet/server/db.py                 | 17 +++++++++--------
 lbry/tests/integration/test_claim_commands.py | 12 ++++++------
 lbry/tests/unit/schema/test_tags.py           | 19 +++++++++++++++++++
 4 files changed, 47 insertions(+), 14 deletions(-)
 create mode 100644 lbry/lbry/schema/tags.py
 create mode 100644 lbry/tests/unit/schema/test_tags.py

diff --git a/lbry/lbry/schema/tags.py b/lbry/lbry/schema/tags.py
new file mode 100644
index 000000000..015836e1c
--- /dev/null
+++ b/lbry/lbry/schema/tags.py
@@ -0,0 +1,13 @@
+from typing import List
+import re
+
+MULTI_SPACE_RE = re.compile(r"\s{2,}")
+WEIRD_CHARS_RE = re.compile(r"[#!~]")
+
+
+def normalize_tag(tag: str):
+    return MULTI_SPACE_RE.sub(' ', WEIRD_CHARS_RE.sub(' ', tag.lower())).strip()
+
+
+def clean_tags(tags: List[str]):
+    return [tag for tag in (normalize_tag(tag) for tag in tags) if tag]
diff --git a/lbry/lbry/wallet/server/db.py b/lbry/lbry/wallet/server/db.py
index 422ad6aaa..1dd0ca36d 100644
--- a/lbry/lbry/wallet/server/db.py
+++ b/lbry/lbry/wallet/server/db.py
@@ -10,6 +10,7 @@ from torba.server.util import class_logger
 from torba.client.basedatabase import query, constraints_to_sql
 
 from lbry.schema.url import URL, normalize_name
+from lbry.schema.tags import clean_tags
 from lbry.schema.mime_types import guess_stream_type
 from lbry.wallet.ledger import MainNetLedger, RegTestLedger
 from lbry.wallet.transaction import Transaction, Output
@@ -34,8 +35,8 @@ STREAM_TYPES = {
 }
 
 
-def _apply_constraints_for_array_attributes(constraints, attr):
-    any_items = constraints.pop(f'any_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
+def _apply_constraints_for_array_attributes(constraints, attr, cleaner):
+    any_items = cleaner(constraints.pop(f'any_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
     if any_items:
         constraints.update({
             f'$any_{attr}{i}': item for i, item in enumerate(any_items)
@@ -47,7 +48,7 @@ def _apply_constraints_for_array_attributes(constraints, attr):
             SELECT DISTINCT claim_hash FROM {attr} WHERE {attr} IN ({values})
         """
 
-    all_items = constraints.pop(f'all_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
+    all_items = cleaner(constraints.pop(f'all_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
     if all_items:
         constraints[f'$all_{attr}_count'] = len(all_items)
         constraints.update({
@@ -61,7 +62,7 @@ def _apply_constraints_for_array_attributes(constraints, attr):
             GROUP BY claim_hash HAVING COUNT({attr}) = :$all_{attr}_count
         """
 
-    not_items = constraints.pop(f'not_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
+    not_items = cleaner(constraints.pop(f'not_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
     if not_items:
         constraints.update({
             f'$not_{attr}{i}': item for i, item in enumerate(not_items)
@@ -293,7 +294,7 @@ class SQLDB:
             elif claim.is_channel:
                 claim_record['claim_type'] = CLAIM_TYPES['channel']
 
-            for tag in claim.message.tags:
+            for tag in clean_tags(claim.message.tags):
                 tags.append((tag, claim_hash, tx.height))
 
         if clear_first:
@@ -820,9 +821,9 @@ class SQLDB:
             if media_types:
                 constraints['claim.media_type__in'] = media_types
 
-        _apply_constraints_for_array_attributes(constraints, 'tag')
-        _apply_constraints_for_array_attributes(constraints, 'language')
-        _apply_constraints_for_array_attributes(constraints, 'location')
+        _apply_constraints_for_array_attributes(constraints, 'tag', clean_tags)
+        _apply_constraints_for_array_attributes(constraints, 'language', lambda _: _)
+        _apply_constraints_for_array_attributes(constraints, 'location', lambda _: _)
 
         select = f"SELECT {cols} FROM claim"
 
diff --git a/lbry/tests/integration/test_claim_commands.py b/lbry/tests/integration/test_claim_commands.py
index 02deb346d..96cf80241 100644
--- a/lbry/tests/integration/test_claim_commands.py
+++ b/lbry/tests/integration/test_claim_commands.py
@@ -180,14 +180,14 @@ class ClaimSearchCommand(ClaimTestCase):
         self.assertEqual(out_of_bounds, [])
 
     async def test_tag_search(self):
-        claim1 = await self.stream_create('claim1', tags=['abc'])
-        claim2 = await self.stream_create('claim2', tags=['abc', 'def'])
+        claim1 = await self.stream_create('claim1', tags=['aBc'])
+        claim2 = await self.stream_create('claim2', tags=['#abc', 'def'])
         claim3 = await self.stream_create('claim3', tags=['abc', 'ghi', 'jkl'])
-        claim4 = await self.stream_create('claim4', tags=['abc', 'ghi', 'mno'])
+        claim4 = await self.stream_create('claim4', tags=['abc\t', 'ghi', 'mno'])
         claim5 = await self.stream_create('claim5', tags=['pqr'])
 
         # any_tags
-        await self.assertFindsClaims([claim5, claim4, claim3, claim2, claim1], any_tags=['abc', 'pqr'])
+        await self.assertFindsClaims([claim5, claim4, claim3, claim2, claim1], any_tags=['\tabc', 'pqr'])
         await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc'])
         await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc', 'ghi'])
         await self.assertFindsClaims([claim4, claim3], any_tags=['ghi'])
@@ -196,7 +196,7 @@ class ClaimSearchCommand(ClaimTestCase):
 
         # all_tags
         await self.assertFindsClaims([], all_tags=['abc', 'pqr'])
-        await self.assertFindsClaims([claim4, claim3, claim2, claim1], all_tags=['abc'])
+        await self.assertFindsClaims([claim4, claim3, claim2, claim1], all_tags=['ABC'])
         await self.assertFindsClaims([claim4, claim3], all_tags=['abc', 'ghi'])
         await self.assertFindsClaims([claim4, claim3], all_tags=['ghi'])
         await self.assertFindsClaims([], all_tags=['ghi', 'xyz'])
@@ -204,7 +204,7 @@ class ClaimSearchCommand(ClaimTestCase):
 
         # not_tags
         await self.assertFindsClaims([], not_tags=['abc', 'pqr'])
-        await self.assertFindsClaims([claim5], not_tags=['abc'])
+        await self.assertFindsClaims([claim5], not_tags=['abC'])
         await self.assertFindsClaims([claim5], not_tags=['abc', 'ghi'])
         await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi'])
         await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi', 'xyz'])
diff --git a/lbry/tests/unit/schema/test_tags.py b/lbry/tests/unit/schema/test_tags.py
new file mode 100644
index 000000000..f1517f41c
--- /dev/null
+++ b/lbry/tests/unit/schema/test_tags.py
@@ -0,0 +1,19 @@
+import unittest
+
+from lbry.schema.tags import normalize_tag, clean_tags
+
+
+class TestTagNormalization(unittest.TestCase):
+
+    def assertNormalizedTag(self, clean, dirty):
+        self.assertEqual(clean, normalize_tag(dirty))
+
+    def test_normalize_tag(self):
+        tag = self.assertNormalizedTag
+        tag('', ' \t #!~')
+        tag('tag', 'Tag')
+        tag('t ag', '\tT \nAG ')
+        tag('tag hash', '#tag~#hash!')
+
+    def test_clean_tags(self):
+        self.assertEqual(['tag'], clean_tags([' \t #!~', '!taG', '\t']))