Add duplicate telegram messages filtering
This commit is contained in:
parent
9dceaaeccc
commit
8e9df066a9
@ -1,11 +1,6 @@
|
||||
from django.contrib import admin
|
||||
from vacancies.main import models
|
||||
|
||||
@admin.register(models.Vacancy)
|
||||
class VacancyAdmin(admin.ModelAdmin):
|
||||
pass
|
||||
|
||||
|
||||
# Registers models.Customer with the Django admin site. The class body is empty,
# so no ModelAdmin options are overridden here.
@admin.register(models.Customer)
|
||||
class CustomerAdmin(admin.ModelAdmin):
|
||||
pass
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
from django.core.management import BaseCommand
|
||||
from vacancies.main.models import Vacancy
|
||||
import clickhouse_connect
|
||||
from vacancies.main.vector_store import add_vectors, extract_features, client as qdrant
|
||||
|
||||
@ -46,15 +45,12 @@ class Command(BaseCommand):
|
||||
(id, chat_username, telegram_id, message, timestamp) = row
|
||||
|
||||
link = f"https://t.me/{chat_username}/{telegram_id}"
|
||||
print(f"Processing {index}/{result_rows_len} link: {link}")
|
||||
print(f"Processing {index+1}/{result_rows_len} link: {link}")
|
||||
features = extract_features(message)
|
||||
vacancy, created = Vacancy.objects.get_or_create(
|
||||
link=link,
|
||||
)
|
||||
|
||||
add_vectors(
|
||||
"vacancies",
|
||||
vacancy.id,
|
||||
id,
|
||||
features.model_dump(),
|
||||
{'content': message, 'features_json': features.model_dump()},
|
||||
{'content': message, 'features_json': features.model_dump(), "link": link, "timestamp": timestamp},
|
||||
)
|
||||
|
||||
@ -0,0 +1,26 @@
|
||||
# Generated by Django 5.2.7 on 2025-10-30 21:43
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
# Schema migration for the 'main' app: replaces the RecommendedVacancy.vacancy
# field with a plain integer vacancy_id column and deletes the Vacancy model.
class Migration(migrations.Migration):
|
||||
|
||||
# Must run after migration 0006 of the 'main' app.
dependencies = [
|
||||
('main', '0006_remove_vacancy_features_json'),
|
||||
]
|
||||
|
||||
# The 'vacancy' field is removed before the Vacancy model itself is deleted.
operations = [
|
||||
# Drops the existing 'vacancy' field from recommendedvacancy.
# NOTE(review): no data-copy step precedes this, so previously stored
# vacancy references appear to be discarded -- confirm this is intended.
migrations.RemoveField(
|
||||
model_name='recommendedvacancy',
|
||||
name='vacancy',
|
||||
),
|
||||
# Adds the integer column. default=0 with preserve_default=False is a
# one-off default: it fills existing rows during this migration and is
# not kept on the field afterwards.
migrations.AddField(
|
||||
model_name='recommendedvacancy',
|
||||
name='vacancy_id',
|
||||
field=models.IntegerField(default=0),
|
||||
preserve_default=False,
|
||||
),
|
||||
# Removes the Vacancy model from the app state and schema.
migrations.DeleteModel(
|
||||
name='Vacancy',
|
||||
),
|
||||
]
|
||||
@ -31,30 +31,15 @@ class CustomerCV(models.Model):
|
||||
db_table = "customer_vcs"
|
||||
|
||||
|
||||
class Vacancy(models.Model):
|
||||
content = models.TextField()
|
||||
link = models.URLField(unique=True)
|
||||
created_at = models.DateTimeField(auto_now_add=True)
|
||||
|
||||
objects = models.Manager()
|
||||
|
||||
def __str__(self):
|
||||
return self.content[:100]
|
||||
|
||||
class Meta:
|
||||
verbose_name_plural = 'Vacancies'
|
||||
db_table = "vacancies"
|
||||
|
||||
|
||||
class RecommendedVacancy(models.Model):
|
||||
customer = models.ForeignKey(Customer, on_delete=models.CASCADE)
|
||||
vacancy = models.ForeignKey(Vacancy, on_delete=models.CASCADE)
|
||||
vacancy_id = models.IntegerField()
|
||||
created_at = models.DateTimeField(auto_now_add=True)
|
||||
|
||||
objects = models.Manager()
|
||||
|
||||
def __str__(self):
|
||||
return f'{self.customer.username} -> {self.vacancy.content}'
|
||||
return f'{self.customer.username} -> {self.vacancy_id}'
|
||||
|
||||
class Meta:
|
||||
verbose_name_plural = 'Recommended Vacancies'
|
||||
|
||||
@ -68,6 +68,32 @@ def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
|
||||
if text:
|
||||
vec = embedding.embed_query(text)
|
||||
vectors[name] = vec
|
||||
|
||||
max_similarities = {}
|
||||
for name, vec in vectors.items():
|
||||
if any(v != 0 for v in vec):
|
||||
results = client.query_points(
|
||||
collection_name="vacancies",
|
||||
query=vec,
|
||||
using=name,
|
||||
limit=1000,
|
||||
)
|
||||
for res in results.points:
|
||||
vid = res.id
|
||||
sim = res.score
|
||||
if vid not in max_similarities:
|
||||
max_similarities[vid] = {}
|
||||
max_similarities[vid][name] = sim
|
||||
|
||||
scored = []
|
||||
for vid, feature_sims in max_similarities.items():
|
||||
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
|
||||
scored.append({"id": vid, "score": total})
|
||||
|
||||
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||
if scored and scored[0]["score"] > 35: # threshold: above this best weighted score the item is treated as a duplicate and is not upserted
|
||||
return
|
||||
|
||||
client.upsert(
|
||||
collection_name=collection_name,
|
||||
points=[
|
||||
@ -114,6 +140,8 @@ def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]:
|
||||
scored.append({"id": vid, "score": total, "content": vacancies_content[vid]})
|
||||
|
||||
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||
for i in range(20):
|
||||
print(f"{scored[i]['content']} {scored[i]['score']}")
|
||||
return scored[0]["id"], scored[0]["content"]
|
||||
|
||||
|
||||
@ -137,7 +165,7 @@ def extract_features(content: str) -> VacancyFeatures:
|
||||
Vacancy content:
|
||||
{content}
|
||||
"""
|
||||
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal")
|
||||
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
|
||||
structured_llm = openai_client.with_structured_output(VacancyFeatures)
|
||||
response = structured_llm.invoke(prompt)
|
||||
return response
|
||||
|
||||
Loading…
Reference in New Issue
Block a user