Add duplicate telegram messages filtering
This commit is contained in:
parent
9dceaaeccc
commit
8e9df066a9
@ -1,11 +1,6 @@
|
|||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from vacancies.main import models
|
from vacancies.main import models
|
||||||
|
|
||||||
@admin.register(models.Vacancy)
|
|
||||||
class VacancyAdmin(admin.ModelAdmin):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@admin.register(models.Customer)
|
@admin.register(models.Customer)
|
||||||
class CustomerAdmin(admin.ModelAdmin):
|
class CustomerAdmin(admin.ModelAdmin):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@ -1,5 +1,4 @@
|
|||||||
from django.core.management import BaseCommand
|
from django.core.management import BaseCommand
|
||||||
from vacancies.main.models import Vacancy
|
|
||||||
import clickhouse_connect
|
import clickhouse_connect
|
||||||
from vacancies.main.vector_store import add_vectors, extract_features, client as qdrant
|
from vacancies.main.vector_store import add_vectors, extract_features, client as qdrant
|
||||||
|
|
||||||
@ -46,15 +45,12 @@ class Command(BaseCommand):
|
|||||||
(id, chat_username, telegram_id, message, timestamp) = row
|
(id, chat_username, telegram_id, message, timestamp) = row
|
||||||
|
|
||||||
link = f"https://t.me/{chat_username}/{telegram_id}"
|
link = f"https://t.me/{chat_username}/{telegram_id}"
|
||||||
print(f"Processing {index}/{result_rows_len} link: {link}")
|
print(f"Processing {index+1}/{result_rows_len} link: {link}")
|
||||||
features = extract_features(message)
|
features = extract_features(message)
|
||||||
vacancy, created = Vacancy.objects.get_or_create(
|
|
||||||
link=link,
|
|
||||||
)
|
|
||||||
|
|
||||||
add_vectors(
|
add_vectors(
|
||||||
"vacancies",
|
"vacancies",
|
||||||
vacancy.id,
|
id,
|
||||||
features.model_dump(),
|
features.model_dump(),
|
||||||
{'content': message, 'features_json': features.model_dump()},
|
{'content': message, 'features_json': features.model_dump(), "link": link, "timestamp": timestamp},
|
||||||
)
|
)
|
||||||
|
|||||||
@ -0,0 +1,26 @@
|
|||||||
|
# Generated by Django 5.2.7 on 2025-10-30 21:43
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('main', '0006_remove_vacancy_features_json'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.RemoveField(
|
||||||
|
model_name='recommendedvacancy',
|
||||||
|
name='vacancy',
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='recommendedvacancy',
|
||||||
|
name='vacancy_id',
|
||||||
|
field=models.IntegerField(default=0),
|
||||||
|
preserve_default=False,
|
||||||
|
),
|
||||||
|
migrations.DeleteModel(
|
||||||
|
name='Vacancy',
|
||||||
|
),
|
||||||
|
]
|
||||||
@ -31,30 +31,15 @@ class CustomerCV(models.Model):
|
|||||||
db_table = "customer_vcs"
|
db_table = "customer_vcs"
|
||||||
|
|
||||||
|
|
||||||
class Vacancy(models.Model):
|
|
||||||
content = models.TextField()
|
|
||||||
link = models.URLField(unique=True)
|
|
||||||
created_at = models.DateTimeField(auto_now_add=True)
|
|
||||||
|
|
||||||
objects = models.Manager()
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return self.content[:100]
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
verbose_name_plural = 'Vacancies'
|
|
||||||
db_table = "vacancies"
|
|
||||||
|
|
||||||
|
|
||||||
class RecommendedVacancy(models.Model):
|
class RecommendedVacancy(models.Model):
|
||||||
customer = models.ForeignKey(Customer, on_delete=models.CASCADE)
|
customer = models.ForeignKey(Customer, on_delete=models.CASCADE)
|
||||||
vacancy = models.ForeignKey(Vacancy, on_delete=models.CASCADE)
|
vacancy_id = models.IntegerField()
|
||||||
created_at = models.DateTimeField(auto_now_add=True)
|
created_at = models.DateTimeField(auto_now_add=True)
|
||||||
|
|
||||||
objects = models.Manager()
|
objects = models.Manager()
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f'{self.customer.username} -> {self.vacancy.content}'
|
return f'{self.customer.username} -> {self.vacancy_id}'
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
verbose_name_plural = 'Recommended Vacancies'
|
verbose_name_plural = 'Recommended Vacancies'
|
||||||
|
|||||||
@ -68,6 +68,32 @@ def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
|
|||||||
if text:
|
if text:
|
||||||
vec = embedding.embed_query(text)
|
vec = embedding.embed_query(text)
|
||||||
vectors[name] = vec
|
vectors[name] = vec
|
||||||
|
|
||||||
|
max_similarities = {}
|
||||||
|
for name, vec in vectors.items():
|
||||||
|
if any(v != 0 for v in vec):
|
||||||
|
results = client.query_points(
|
||||||
|
collection_name="vacancies",
|
||||||
|
query=vec,
|
||||||
|
using=name,
|
||||||
|
limit=1000,
|
||||||
|
)
|
||||||
|
for res in results.points:
|
||||||
|
vid = res.id
|
||||||
|
sim = res.score
|
||||||
|
if vid not in max_similarities:
|
||||||
|
max_similarities[vid] = {}
|
||||||
|
max_similarities[vid][name] = sim
|
||||||
|
|
||||||
|
scored = []
|
||||||
|
for vid, feature_sims in max_similarities.items():
|
||||||
|
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
|
||||||
|
scored.append({"id": vid, "score": total})
|
||||||
|
|
||||||
|
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||||
|
if scored and scored[0]["score"] > 35:  # threshold
|
||||||
|
return
|
||||||
|
|
||||||
client.upsert(
|
client.upsert(
|
||||||
collection_name=collection_name,
|
collection_name=collection_name,
|
||||||
points=[
|
points=[
|
||||||
@ -114,6 +140,8 @@ def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]:
|
|||||||
scored.append({"id": vid, "score": total, "content": vacancies_content[vid]})
|
scored.append({"id": vid, "score": total, "content": vacancies_content[vid]})
|
||||||
|
|
||||||
scored.sort(key=lambda x: x["score"], reverse=True)
|
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||||
|
for i in range(20):
|
||||||
|
print(f"{scored[i]['content']} {scored[i]['score']}")
|
||||||
return scored[0]["id"], scored[0]["content"]
|
return scored[0]["id"], scored[0]["content"]
|
||||||
|
|
||||||
|
|
||||||
@ -137,7 +165,7 @@ def extract_features(content: str) -> VacancyFeatures:
|
|||||||
Vacancy content:
|
Vacancy content:
|
||||||
{content}
|
{content}
|
||||||
"""
|
"""
|
||||||
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal")
|
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
|
||||||
structured_llm = openai_client.with_structured_output(VacancyFeatures)
|
structured_llm = openai_client.with_structured_output(VacancyFeatures)
|
||||||
response = structured_llm.invoke(prompt)
|
response = structured_llm.invoke(prompt)
|
||||||
return response
|
return response
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user