diff --git a/drivers/elastic.py b/drivers/elastic.py index b9863d2..055e55b 100644 --- a/drivers/elastic.py +++ b/drivers/elastic.py @@ -112,4 +112,54 @@ class Elastic_Testing(DB_Testing): } } } - return f"Got {driver.search(index=INDEX, body=query).body['aggregations']['attachment_count']['value']}" \ No newline at end of file + return f"Got {driver.search(index=INDEX, body=query).body['aggregations']['attachment_count']['value']}" + + def update_add_replies_per_last_name(self, driver: Elasticsearch) -> str: + agg_query = { + "size": 0, + "aggs": { + "reply_count_by_last_name": { + "terms": { + "script": { + "source": "String name = doc['sender_name.keyword'].value; String[] parts = /\\s+/.split(name); return parts[parts.length - 1]", + "lang": "painless" + }, + "size": 1000 + }, + "aggs": { + "total_reply_count": { + "sum": { + "field": "reply_count" + } + } + } + } + } + } + + agg_results = driver.search(index=INDEX, body=agg_query) + + buckets = agg_results['aggregations']['reply_count_by_last_name']['buckets'] + reply_counts = {bucket['key']: bucket['total_reply_count']['value'] for bucket in buckets} + + update_query = { + "script": { + "source": """ + String name = ctx._source.sender_name; + String[] parts = /\\s+/.split(name); + String lastName = parts[parts.length - 1]; + if (params.replyCounts.containsKey(lastName)) { + ctx._source.total_reply_count = params.replyCounts.get(lastName); + } + """, + "lang": "painless", + "params": { + "replyCounts": reply_counts + } + }, + "query": { + "match_all": {} + } + } + + driver.update_by_query(index=INDEX, body=update_query, conflicts="proceed") \ No newline at end of file diff --git a/drivers/psql.py b/drivers/psql.py index 30e9b78..0ffe969 100644 --- a/drivers/psql.py +++ b/drivers/psql.py @@ -29,6 +29,8 @@ class PSQL_Thread(Data_Thread): class PSQL_Testing(DB_Testing): driver_name = "PSQL" + + def __init__(self) -> None: super().__init__() self.data = [PSQL_Thread(x) for x in Threads] @@ -109,4 +111,29 @@ class PSQL_Testing(DB_Testing): cursor.execute(searchQuery) driver.commit() return f"Got {cursor.fetchone()[0]}" - \ No newline at end of file + + def update_add_replies_per_last_name(self, driver: connection) -> str: + cursor = driver.cursor() + alterQuery = """ + ALTER TABLE thread ADD COLUMN last_name_replies INT; + """ + updateQuery = """ + WITH last_name_sums AS ( + SELECT + recipient_name, + SUM(reply_count) OVER (PARTITION BY SPLIT_PART(recipient_name, ' ', -1)) AS total_replies + FROM + thread + ) + UPDATE thread AS t + SET last_name_replies = ln_sums.total_replies + FROM last_name_sums AS ln_sums + WHERE t.recipient_name = ln_sums.recipient_name; + """ + unalterQuery = """ + ALTER TABLE thread DROP COLUMN last_name_replies; + """ + cursor.execute(alterQuery) + cursor.execute(updateQuery) + cursor.execute(unalterQuery) + driver.commit() \ No newline at end of file diff --git a/interfaces/db_testing_interface.py b/interfaces/db_testing_interface.py index 17a1b12..02019c6 100644 --- a/interfaces/db_testing_interface.py +++ b/interfaces/db_testing_interface.py @@ -5,7 +5,7 @@ from utils.utils import time_func from time import sleep T = TypeVar('T') -NUM_RUNS = 10000 +NUM_RUNS = 10 class DB_Testing(Generic[T]): driver_name: str @@ -50,6 +50,10 @@ class DB_Testing(Generic[T]): def get_sum_attachment_less_5(self, driver: T) -> str: pass + @abstractmethod + def update_add_replies_per_last_name(self, driver: T) -> str: + pass + def do_tests(self, driver: T) -> Dict[str, float]: stats = {} @@ -63,9 +67,15 @@ class DB_Testing(Generic[T]): sleep(1) # To wait for any indexing to avoid race conditions somehow # Multiple + + # Search stats.update(time_func(f"[{self.driver_name}] Count mails with more than 5 attachments and Mr/Mrs", "attach5_mr_mrs", lambda : self.attach5_mr_mrs(driver), NUM_RUNS)) stats.update(time_func(f"[{self.driver_name}] Sum the number of attachment from mail that have less than 5 attachments", "get_sum_attachment_less_5", lambda : self.get_sum_attachment_less_5(driver), NUM_RUNS)) stats.update(time_func(f"[{self.driver_name}] Count mails that have a lorem sentence", "search_lorem", lambda : self.search_lorem(driver), NUM_RUNS)) stats.update(time_func(f"[{self.driver_name}] Count mails that have a mail that ends with microsoft.com", "search_mails_ends_microsoftcom", lambda : self.search_mails_ends_microsoftcom(driver), NUM_RUNS)) + + # Update + stats.update(time_func(f"[{self.driver_name}] Update all thread to add a field with the total replies per last name", "update_add_replies_per_last_name", lambda : self.update_add_replies_per_last_name(driver), NUM_RUNS)) + print("\n") return stats \ No newline at end of file diff --git a/plot.png b/plot.png new file mode 100644 index 0000000..481814e Binary files /dev/null and b/plot.png differ