From 8e9df066a94721f899e6336d15320ec5d7e58a67 Mon Sep 17 00:00:00 2001 From: estromenko Date: Fri, 31 Oct 2025 00:43:11 +0300 Subject: [PATCH] Add duplicate telegram messages filtering --- vacancies/main/admin.py | 5 ---- ...ollect_vacancies_from_telegram_messages.py | 10 ++----- ...ove_recommendedvacancy_vacancy_and_more.py | 26 ++++++++++++++++ vacancies/main/models.py | 19 ++---------- vacancies/main/vector_store.py | 30 ++++++++++++++++++- 5 files changed, 60 insertions(+), 30 deletions(-) create mode 100644 vacancies/main/migrations/0007_remove_recommendedvacancy_vacancy_and_more.py diff --git a/vacancies/main/admin.py b/vacancies/main/admin.py index dccdb43..1438c23 100644 --- a/vacancies/main/admin.py +++ b/vacancies/main/admin.py @@ -1,11 +1,6 @@ from django.contrib import admin from vacancies.main import models -@admin.register(models.Vacancy) -class VacancyAdmin(admin.ModelAdmin): - pass - - @admin.register(models.Customer) class CustomerAdmin(admin.ModelAdmin): pass diff --git a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py index 784fb6a..305fdde 100644 --- a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py +++ b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py @@ -1,5 +1,4 @@ from django.core.management import BaseCommand -from vacancies.main.models import Vacancy import clickhouse_connect from vacancies.main.vector_store import add_vectors, extract_features, client as qdrant @@ -46,15 +45,12 @@ class Command(BaseCommand): (id, chat_username, telegram_id, message, timestamp) = row link = f"https://t.me/{chat_username}/{telegram_id}" - print(f"Processing {index}/{result_rows_len} link: {link}") + print(f"Processing {index+1}/{result_rows_len} link: {link}") features = extract_features(message) - vacancy, created = Vacancy.objects.get_or_create( - link=link, - ) add_vectors( 
"vacancies", - vacancy.id, + id, features.model_dump(), - {'content': message, 'features_json': features.model_dump()}, + {'content': message, 'features_json': features.model_dump(), "link": link, "timestamp": timestamp}, ) diff --git a/vacancies/main/migrations/0007_remove_recommendedvacancy_vacancy_and_more.py b/vacancies/main/migrations/0007_remove_recommendedvacancy_vacancy_and_more.py new file mode 100644 index 0000000..8f8453a --- /dev/null +++ b/vacancies/main/migrations/0007_remove_recommendedvacancy_vacancy_and_more.py @@ -0,0 +1,26 @@ +# Generated by Django 5.2.7 on 2025-10-30 21:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0006_remove_vacancy_features_json'), + ] + + operations = [ + migrations.RemoveField( + model_name='recommendedvacancy', + name='vacancy', + ), + migrations.AddField( + model_name='recommendedvacancy', + name='vacancy_id', + field=models.IntegerField(default=0), + preserve_default=False, + ), + migrations.DeleteModel( + name='Vacancy', + ), + ] diff --git a/vacancies/main/models.py b/vacancies/main/models.py index 836117b..e263677 100644 --- a/vacancies/main/models.py +++ b/vacancies/main/models.py @@ -31,30 +31,15 @@ class CustomerCV(models.Model): db_table = "customer_vcs" -class Vacancy(models.Model): - content = models.TextField() - link = models.URLField(unique=True) - created_at = models.DateTimeField(auto_now_add=True) - - objects = models.Manager() - - def __str__(self): - return self.content[:100] - - class Meta: - verbose_name_plural = 'Vacancies' - db_table = "vacancies" - - class RecommendedVacancy(models.Model): customer = models.ForeignKey(Customer, on_delete=models.CASCADE) - vacancy = models.ForeignKey(Vacancy, on_delete=models.CASCADE) + vacancy_id = models.IntegerField() created_at = models.DateTimeField(auto_now_add=True) objects = models.Manager() def __str__(self): - return f'{self.customer.username} -> {self.vacancy.content}' + return 
f'{self.customer.username} -> {self.vacancy_id}' class Meta: verbose_name_plural = 'Recommended Vacancies' diff --git a/vacancies/main/vector_store.py b/vacancies/main/vector_store.py index 82e43d1..c48e052 100644 --- a/vacancies/main/vector_store.py +++ b/vacancies/main/vector_store.py @@ -68,6 +68,32 @@ def add_vectors(collection_name: str, _id: int, features: dict, payload: dict): if text: vec = embedding.embed_query(text) vectors[name] = vec + + max_similarities = {} + for name, vec in vectors.items(): + if any(v != 0 for v in vec): + results = client.query_points( + collection_name="vacancies", + query=vec, + using=name, + limit=1000, + ) + for res in results.points: + vid = res.id + sim = res.score + if vid not in max_similarities: + max_similarities[vid] = {} + max_similarities[vid][name] = sim + + scored = [] + for vid, feature_sims in max_similarities.items(): + total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims) + scored.append({"id": vid, "score": total}) + + scored.sort(key=lambda x: x["score"], reverse=True) + if scored and scored[0]["score"] > 35: # threshold + return + client.upsert( collection_name=collection_name, points=[ @@ -114,6 +140,8 @@ def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]: scored.append({"id": vid, "score": total, "content": vacancies_content[vid]}) scored.sort(key=lambda x: x["score"], reverse=True) + for i in range(20): + print(f"{scored[i]['content']} {scored[i]['score']}") return scored[0]["id"], scored[0]["content"] @@ -137,7 +165,7 @@ def extract_features(content: str) -> VacancyFeatures: Vacancy content: {content} """ - openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal") + openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1) structured_llm = openai_client.with_structured_output(VacancyFeatures) response = structured_llm.invoke(prompt) return response