diff --git a/lomakinae/.gitignore b/lomakinae/.gitignore new file mode 100644 index 0000000..7a60b85 --- /dev/null +++ b/lomakinae/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.pyc diff --git a/lomakinae/docs/01_report.md b/lomakinae/docs/01_report.md new file mode 100644 index 0000000..27029c2 --- /dev/null +++ b/lomakinae/docs/01_report.md @@ -0,0 +1,86 @@ +# Отчёт. Задание 1 - структуры данных + +## Цель + +Реализовать три структуры данных (связный список, хеш-таблица, BST) и экспериментально сравнить их +производительность на операциях insert / find / delete при случайном и отсортированном порядке входных +данных. + +## Параметры эксперимента + +| Параметр | Значение | +| ----------- | ------------------------------------ | +| N (записей) | 10 000 | +| Повторений | 5 | +| Поисков | 100 существующих + 10 несуществующих | +| Удалений | 50 | + +--- + +## Результаты + +### Случайные данные (shuffled) + +![shuffled](data/01/attachments/plot_shuffled.png) + +### Отсортированные данные (sorted) + +![sorted](data/01/attachments/plot_sorted.png) + +--- + +## Анализ + +### Деградация BST на отсортированных данных + +![bst comparison](data/01/attachments/plot_bst_comparison.png) + +На случайных данных BST - самая быстрая структура: insert 0.027 с. Случайный порядок вставки даёт сбалансированное +дерево глубиной ~log N, поэтому каждый новый узел находит своё место за O(log N) шагов. На отсортированных - 12.77 с, +то есть в ~473 раз медленнее. Причина: при последовательной вставке отсортированных ключей каждый новый узел уходит в +правое поддерево предыдущего. Дерево вырождается в цепочку глубиной N, и каждая вставка требует O(N) шагов вместо O(log N). + +### Хеш-таблица нечувствительна к порядку + +HashTable показывает практически одинаковое время в обоих режимах (insert: 0.033 с против 0.032 с). Это ожидаемо: +индекс бакета вычисляется через `hash(name)`, который не зависит от порядка вставки. Операции работают за O(1) +при любом входе. 
+ +### Связный список медленен при поиске + +LinkedList не имеет никакой структуры для навигации - единственный способ найти запись это пройти список от головы до нужного узла. +Find всегда O(N) независимо от порядка данных: shuffled 0.041 с, sorted 0.039 с. Insert O(N^2) для всей выборки - перед каждой вставкой +нужно пройти весь список для проверки дубликата. На отсортированных данных LinkedList не меняет поведение, тогда как BST деградирует +до 12.77 с - в этом единственном сценарии LinkedList оказывается быстрее BST. + +### Удаление + +LinkedList - чтобы удалить узел, нужно пройти список от головы до нужного элемента и перешить next предшественника. +Это O(N) в любом случае, порядок данных не имеет значения. Shuffled: 0.027 с, sorted: 0.026 с - разница в пределах погрешности. + +HashTable - вычисляем индекс бакета через `hash(name)`, затем удаляем узел из связного списка этого бакета. Порядок вставки не +влияет на то, в каком бакете лежит запись, поэтому время стабильно в обоих режимах: 0.00033 с. + +BST - ищем узел спуском по дереву, затем обрабатываем три случая: нет потомков, один потомок, два потомка. На случайных данных +дерево сбалансировано, глубина ~log N, удаление занимает 0.00014 с. На отсортированных данных дерево вырождено в +цепочку - каждый узел уходил в правое поддерево при вставке, поэтому поиск удаляемого узла проходит через всю цепочку O(N). +Результат: 0.061 с, то есть в ~435 раз медленнее. + +--- + +## Вывод + +**Частые вставки** - HashTable. Время вставки не зависит от порядка данных: индекс бакета вычисляется за O(1), +а вставка в бакет требует прохода только по цепочке этого бакета - в среднем N/128 ≈ 78 узлов при N=10000, так как число бакетов фиксировано (128) и таблица не расширяется. Подтверждают цифры: 0.033 с на shuffled и 0.032 с на sorted при N=10000. + +**Частый поиск** - HashTable. По той же причине: `hash(name)` сразу указывает на нужный бакет, и перебирать нужно только цепочку этого бакета, а не все записи. +Find: 0.00057 с на shuffled, 0.00070 с на sorted - стабильно при любом порядке входных данных. + +**Получить данные в отсортированном порядке** - BST при случайном порядке вставки. 
Элементы размещаются по правилу BST +(слева меньшие корня, справа большие корня), поэтому обход по схеме левое поддерево -> корень -> правое поддерево возвращает +все записи в алфавитном порядке без дополнительной сортировки. Важное условие: данные должны вставляться в случайном +порядке, иначе дерево вырождается (см. деградацию BST на отсортированных данных). + +LinkedList сам по себе проигрывает HashTable по всем операциям из-за O(N^2) на вставку всей выборки и O(N) на поиск. Однако его идея лежит в основе +хеш-таблицы: каждый бакет - это связный список, через который разрешаются коллизии. Как самостоятельная структура данных для +справочника он неэффективен, но как строительный блок внутри HashTable - незаменим. diff --git a/lomakinae/docs/data/01/README.md b/lomakinae/docs/data/01/README.md new file mode 100644 index 0000000..3028774 --- /dev/null +++ b/lomakinae/docs/data/01/README.md @@ -0,0 +1,9 @@ +# Задание 1: структуры данных + +## Как запустить +```sh +python main.py +``` + +## Результаты +Графики генерируются автоматически в папку `attachments/`. 
diff --git a/lomakinae/docs/data/01/attachments/plot_bst_comparison.png b/lomakinae/docs/data/01/attachments/plot_bst_comparison.png new file mode 100644 index 0000000..2eedb28 Binary files /dev/null and b/lomakinae/docs/data/01/attachments/plot_bst_comparison.png differ diff --git a/lomakinae/docs/data/01/attachments/plot_shuffled.png b/lomakinae/docs/data/01/attachments/plot_shuffled.png new file mode 100644 index 0000000..e1e71ed Binary files /dev/null and b/lomakinae/docs/data/01/attachments/plot_shuffled.png differ diff --git a/lomakinae/docs/data/01/attachments/plot_sorted.png b/lomakinae/docs/data/01/attachments/plot_sorted.png new file mode 100644 index 0000000..53258b0 Binary files /dev/null and b/lomakinae/docs/data/01/attachments/plot_sorted.png differ diff --git a/lomakinae/docs/data/01/main.py b/lomakinae/docs/data/01/main.py new file mode 100644 index 0000000..60ad5c7 --- /dev/null +++ b/lomakinae/docs/data/01/main.py @@ -0,0 +1,6 @@ +from src.experiment import main_experiment +from src.plot import build_plots + +if __name__ == "__main__": + main_experiment() + build_plots() diff --git a/lomakinae/docs/data/01/results.csv b/lomakinae/docs/data/01/results.csv new file mode 100644 index 0000000..b57393b --- /dev/null +++ b/lomakinae/docs/data/01/results.csv @@ -0,0 +1,91 @@ +structure,mode,operation,run,time_sec +LinkedList,shuffled,insert,1,3.25562 +LinkedList,shuffled,find,1,0.040773 +LinkedList,shuffled,delete,1,0.026344 +HashTable,shuffled,insert,1,0.033497 +HashTable,shuffled,find,1,0.000593 +HashTable,shuffled,delete,1,0.000348 +BST,shuffled,insert,1,0.024071 +BST,shuffled,find,1,0.000218 +BST,shuffled,delete,1,0.000136 +LinkedList,shuffled,insert,2,3.454281 +LinkedList,shuffled,find,2,0.040282 +LinkedList,shuffled,delete,2,0.026526 +HashTable,shuffled,insert,2,0.031691 +HashTable,shuffled,find,2,0.000568 +HashTable,shuffled,delete,2,0.000338 +BST,shuffled,insert,2,0.024978 +BST,shuffled,find,2,0.000213 +BST,shuffled,delete,2,0.000135 
+LinkedList,shuffled,insert,3,3.453681 +LinkedList,shuffled,find,3,0.0404 +LinkedList,shuffled,delete,3,0.026843 +HashTable,shuffled,insert,3,0.031902 +HashTable,shuffled,find,3,0.000536 +HashTable,shuffled,delete,3,0.000319 +BST,shuffled,insert,3,0.025369 +BST,shuffled,find,3,0.000219 +BST,shuffled,delete,3,0.000138 +LinkedList,shuffled,insert,4,3.417185 +LinkedList,shuffled,find,4,0.040816 +LinkedList,shuffled,delete,4,0.027023 +HashTable,shuffled,insert,4,0.037826 +HashTable,shuffled,find,4,0.000582 +HashTable,shuffled,delete,4,0.00033 +BST,shuffled,insert,4,0.036423 +BST,shuffled,find,4,0.000227 +BST,shuffled,delete,4,0.00014 +LinkedList,shuffled,insert,5,3.4723 +LinkedList,shuffled,find,5,0.040734 +LinkedList,shuffled,delete,5,0.027866 +HashTable,shuffled,insert,5,0.031981 +HashTable,shuffled,find,5,0.000546 +HashTable,shuffled,delete,5,0.000332 +BST,shuffled,insert,5,0.024578 +BST,shuffled,find,5,0.000227 +BST,shuffled,delete,5,0.000146 +LinkedList,sorted,insert,1,3.271489 +LinkedList,sorted,find,1,0.038886 +LinkedList,sorted,delete,1,0.026646 +HashTable,sorted,insert,1,0.030995 +HashTable,sorted,find,1,0.000625 +HashTable,sorted,delete,1,0.000302 +BST,sorted,insert,1,13.000812 +BST,sorted,find,1,0.128239 +BST,sorted,delete,1,0.06369 +LinkedList,sorted,insert,2,3.384572 +LinkedList,sorted,find,2,0.03915 +LinkedList,sorted,delete,2,0.026683 +HashTable,sorted,insert,2,0.032596 +HashTable,sorted,find,2,0.0006 +HashTable,sorted,delete,2,0.000315 +BST,sorted,insert,2,12.593249 +BST,sorted,find,2,0.10657 +BST,sorted,delete,2,0.058763 +LinkedList,sorted,insert,3,3.27816 +LinkedList,sorted,find,3,0.038938 +LinkedList,sorted,delete,3,0.025567 +HashTable,sorted,insert,3,0.03168 +HashTable,sorted,find,3,0.000631 +HashTable,sorted,delete,3,0.00031 +BST,sorted,insert,3,12.809241 +BST,sorted,find,3,0.110947 +BST,sorted,delete,3,0.062604 +LinkedList,sorted,insert,4,3.277437 +LinkedList,sorted,find,4,0.039812 +LinkedList,sorted,delete,4,0.025627 
+HashTable,sorted,insert,4,0.031844 +HashTable,sorted,find,4,0.000917 +HashTable,sorted,delete,4,0.000383 +BST,sorted,insert,4,12.722063 +BST,sorted,find,4,0.111841 +BST,sorted,delete,4,0.060014 +LinkedList,sorted,insert,5,3.261706 +LinkedList,sorted,find,5,0.037981 +LinkedList,sorted,delete,5,0.025241 +HashTable,sorted,insert,5,0.032067 +HashTable,sorted,find,5,0.000742 +HashTable,sorted,delete,5,0.000342 +BST,sorted,insert,5,12.713176 +BST,sorted,find,5,0.108333 +BST,sorted,delete,5,0.059109 diff --git a/lomakinae/docs/data/01/src/__init__.py b/lomakinae/docs/data/01/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lomakinae/docs/data/01/src/bench.py b/lomakinae/docs/data/01/src/bench.py new file mode 100644 index 0000000..bccd26f --- /dev/null +++ b/lomakinae/docs/data/01/src/bench.py @@ -0,0 +1,72 @@ +import time +from .ll import ll_insert, ll_find, ll_delete +from .ht import ht_new, ht_insert, ht_find, ht_delete +from .bst import bst_insert, bst_find, bst_delete + + +def _build_ll(records): + head = None + for name, phone in records: + head = ll_insert(head, name, phone) + return head + + +def _build_ht(records): + buckets = ht_new() + for name, phone in records: + ht_insert(buckets, name, phone) + return buckets + + +def _build_bst(records): + root = None + for name, phone in records: + root = bst_insert(root, name, phone) + return root + + +def _time_insert(build_fn, records): + start = time.perf_counter() + structure = build_fn(records) + end = time.perf_counter() + elapsed = end - start + return elapsed, structure + + +def _time_find(find_fn, structure, names): + start = time.perf_counter() + for name in names: + find_fn(structure, name) + end = time.perf_counter() + elapsed = end - start + return elapsed + + +def _time_delete(delete_fn, structure, names): + start = time.perf_counter() + for name in names: + result = delete_fn(structure, name) + if result is not None: + structure = result + end = time.perf_counter() + elapsed = end 
def _bst_new_node(name, phone):
    """Return a new leaf node.

    Nodes are plain dicts with the keys 'name', 'phone', 'left' and 'right';
    an empty tree (or a missing child) is represented by None.
    """
    return {'name': name, 'phone': phone, 'left': None, 'right': None}


def bst_insert(root, name, phone):
    """Insert (name, phone) into the tree, or overwrite the phone of an existing name.

    Args:
        root: Root node dict, or None for an empty tree.
        name: Key; compared with ``==`` and ``<`` against stored names.
        phone: Value stored under the key.

    Returns:
        The root of the tree: a new node when ``root`` is None, otherwise
        the same ``root`` object (the tree is modified in place).
    """
    if root is None:
        return _bst_new_node(name, phone)

    # Walk down iteratively: a tree built from sorted keys is a chain as deep
    # as the number of records, which a recursive descent cannot handle
    # without raising the interpreter's recursion limit.
    node = root
    while True:
        if name == node['name']:
            node['phone'] = phone
            return root
        side = 'left' if name < node['name'] else 'right'
        if node[side] is None:
            node[side] = _bst_new_node(name, phone)
            return root
        node = node[side]


def bst_find(root, name):
    """Return the phone stored under ``name``, or None when the name is absent."""
    node = root
    while node is not None:
        if name == node['name']:
            return node['phone']
        node = node['left'] if name < node['name'] else node['right']
    return None


def _bst_min_node(root):
    """Return the leftmost (smallest-name) node of a non-empty subtree."""
    node = root
    while node['left'] is not None:
        node = node['left']
    return node


def bst_delete(root, name):
    """Remove ``name`` from the tree if present.

    Args:
        root: Root node dict, or None for an empty tree.
        name: Key to remove.

    Returns:
        The root of the resulting tree. This is None when the tree is or
        becomes empty, the removed root's only child when the root had at
        most one child, and the unchanged ``root`` object in every other
        case (including "name not found").
    """
    # Locate the node and remember its parent so it can be unlinked later.
    parent = None
    node = root
    while node is not None and name != node['name']:
        parent = node
        node = node['left'] if name < node['name'] else node['right']

    if node is None:
        return root  # nothing to delete

    if node['left'] is not None and node['right'] is not None:
        # Two children: copy the in-order successor (leftmost node of the
        # right subtree) into this node, then unlink the successor instead.
        successor_parent = node
        successor = node['right']
        while successor['left'] is not None:
            successor_parent = successor
            successor = successor['left']
        node['name'] = successor['name']
        node['phone'] = successor['phone']
        parent, node = successor_parent, successor

    # From here on `node` has at most one child; splice that child (or None)
    # into the place `node` occupied.
    child = node['left'] if node['left'] is not None else node['right']
    if parent is None:
        return child
    if parent['left'] is node:
        parent['left'] = child
    else:
        parent['right'] = child
    return root


def bst_list_all(root):
    """Return every (name, phone) pair in ascending name order.

    Uses an explicit stack for the in-order walk, so the result is built in
    a single pass and the depth of the tree does not matter.
    """
    records = []
    pending = []  # ancestors whose own record and right subtree are still to be visited
    node = root
    while pending or node is not None:
        while node is not None:
            pending.append(node)
            node = node['left']
        node = pending.pop()
        records.append((node['name'], node['phone']))
        node = node['right']
    return records
# Number of buckets used when the caller does not ask for a specific size.
# Nothing in this module resizes the table, so with n stored records every
# chain holds about n / size nodes.
DEFAULT_SIZE = 128


def ht_new(size=DEFAULT_SIZE):
    """Create an empty hash table: a list of ``size`` buckets, each None.

    Each bucket later holds the head of a linked list (see the ``ll_*``
    functions) containing the records whose names hash to that slot.

    Raises:
        ValueError: if ``size`` is smaller than 1. Such a table could not
            store anything, and every later operation would fail with
            ZeroDivisionError when computing the bucket index.
    """
    if size < 1:
        raise ValueError(f"hash table size must be at least 1, got {size}")
    return [None] * size


def _ht_index(buckets, name):
    """Map ``name`` to a bucket position using the built-in hash()."""
    return hash(name) % len(buckets)


def ht_insert(buckets, name, phone):
    """Insert or update (name, phone) in place; returns None."""
    i = _ht_index(buckets, name)
    # ll_insert returns the (possibly new) head of the chain.
    buckets[i] = ll_insert(buckets[i], name, phone)


def ht_find(buckets, name):
    """Return the phone stored under ``name``, or None when it is absent."""
    i = _ht_index(buckets, name)
    return ll_find(buckets[i], name)


def ht_delete(buckets, name):
    """Remove ``name`` from its bucket in place; returns None."""
    i = _ht_index(buckets, name)
    buckets[i] = ll_delete(buckets[i], name)


def ht_list_all(buckets):
    """Return every stored (name, phone) pair, sorted."""
    records = []
    for head in buckets:
        records.extend(ll_list_all(head))
    return sorted(records)
def _ll_new_node(name, phone):
    """Build a detached list node (a dict whose 'next' link is None)."""
    return {'name': name, 'phone': phone, 'next': None}


def _ll_iter(head):
    """Yield the nodes of the list from ``head`` to the tail."""
    cursor = head
    while cursor is not None:
        yield cursor
        cursor = cursor['next']


def ll_insert(head, name, phone):
    """Store ``phone`` under ``name`` and return the head of the list.

    An existing entry is updated in place and the head is unchanged;
    otherwise a new node is put in front of the list and becomes the head.
    """
    for entry in _ll_iter(head):
        if entry['name'] == name:
            entry['phone'] = phone
            return head

    fresh = _ll_new_node(name, phone)
    fresh['next'] = head
    return fresh


def ll_find(head, name):
    """Return the phone of the first node named ``name``, or None."""
    matches = (entry['phone'] for entry in _ll_iter(head) if entry['name'] == name)
    return next(matches, None)


def ll_delete(head, name):
    """Unlink the first node named ``name`` and return the resulting head.

    Returns None for an empty list, the second node when the head itself
    is removed, and the original head otherwise (also when nothing matched).
    """
    if head is None:
        return None
    if head['name'] == name:
        return head['next']

    previous, current = head, head['next']
    while current is not None:
        if current['name'] == name:
            previous['next'] = current['next']
            break
        previous, current = current, current['next']
    return head


def ll_list_all(head):
    """Return all (name, phone) pairs of the list in sorted order."""
    pairs = [(entry['name'], entry['phone']) for entry in _ll_iter(head)]
    pairs.sort()
    return pairs
ax.bar(structures, means, color=['#4C72B0', '#55A868', '#C44E52']) + ax.set_title(op) + ax.set_ylabel('t (с)') + ax.set_yscale('log') + ax.grid(True, axis='y', alpha=0.3) + + for bar, val in zip(bars, means): + label = f'{val:.5f}' if val > 0.0001 else f'{val:.1e}' + ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), + label, ha='center', va='bottom', fontsize=9) + + plt.tight_layout() + save_path = ATTACHMENTS_DIR / f'plot_{mode}.png' + plt.savefig(save_path, dpi=300) + plt.close() + print(f'Saved: {save_path.name}') + + +def bst_shuffled_vs_sorted(df): + fig, ax = plt.subplots(figsize=(7, 5)) + fig.suptitle('Производительность bst_insert(): перемешанные и упорядоченные данные') + + bst_insert = df[(df['structure'] == 'BST') & (df['operation'] == 'insert')] + modes_average = bst_insert.groupby('mode')['time_sec'].mean() + modes = ['shuffled', 'sorted'] + means = [modes_average[m] for m in modes] + + bars = ax.bar(modes, means, color=['#4C72B0', '#C44E52']) + ax.set_ylabel('t (c)') + ax.set_yscale('log') + ax.grid(True, axis='y', alpha=0.3) + + for bar, val in zip(bars, means): + label = f'{val:.5f}' if val > 0.0001 else f'{val:.1e}' + ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), + label, ha='center', va='bottom', fontsize=10) + + plt.tight_layout() + save_path = ATTACHMENTS_DIR / 'plot_bst_comparison.png' + plt.savefig(save_path, dpi=300) + plt.close() + print(f'Saved: {save_path.name}') + + +def build_plots(): + ATTACHMENTS_DIR.mkdir(exist_ok=True) + + if not DATA_PATH.exists(): + raise ValueError(f"File not found: {DATA_PATH}") + + df = pd.read_csv(DATA_PATH) + + STRUCTURES = ['LinkedList', 'HashTable', 'BST'] + + shuffled_sorted_plots(STRUCTURES, df) + bst_shuffled_vs_sorted(df) + + +if __name__ == "__main__": + build_plots()