vision-career/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py
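
"""Collect vacancy-like messages from Telegram chats stored in ClickHouse
and index them into the Qdrant "vacancies" collection.

Usage:

    python manage.py collect_vacancies_from_telegram_messages
"""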

from django.core.management import BaseCommand

from vacancies.conf.settings import CLICKHOUSE_CLIENT
from vacancies.main.vector_store import add_vectors, extract_features, client as qdrant

# Pick up messages from the last 30 days that look like job postings: long
# enough, containing at least five vacancy-related keywords (Russian plus
# seniority terms), free of self-promotion markers ('О себе' / 'Обо мне' =
# "About me", '#ищу' = "#looking-for", 'умею' = "I can"), and not already
# indexed in the vector store.
query = """
SELECT id, chat_username, telegram_id, message, timestamp
FROM telegram_parser_chatmessage
WHERE timestamp >= now() - INTERVAL 30 DAY
    AND length(message) > 150
    AND arrayCount(x -> position(message, x) > 0, [
        'вакансия', 'ищем', 'требуется', 'разработчик', 'будет плюсом',
        'зарплата', 'оклад', 'з/п', 'руб', 'опыт работы',
        'требования', 'обязанности', 'условия', 'компания', 'офис',
        'удаленно', 'гибкий график', 'полный день', 'частичная занятость',
        'резюме', 'собеседование', 'junior', 'middle', 'senior'
    ]) >= 5
    AND position(message, 'О себе') = 0
    AND position(message, 'Обо мне') = 0
    AND position(message, '#ищу') = 0
    AND position(message, 'умею') = 0
    AND id NOT IN %(exist_points)s
"""


class Command(BaseCommand):
    help = "Collect vacancies from telegram messages"

    def handle(self, *args, **options):
        # Page through the Qdrant "vacancies" collection to gather the ids of
        # points that are already stored, so the ClickHouse query can skip
        # them. The -1 sentinel keeps NOT IN from ever receiving an empty
        # tuple. qdrant.scroll returns (points, next_offset); next_offset is
        # None once the collection is exhausted.
        next_page_offset = 0
        exist_points_ids = [-1]
        while next_page_offset is not None:
            points, next_page_offset = qdrant.scroll(
                collection_name="vacancies",
                limit=100_000,
                offset=next_page_offset,
                with_payload=False,
                with_vectors=False,
                timeout=30,
            )
            exist_points_ids.extend(point.id for point in points)

        exist_points_set = tuple(set(exist_points_ids))

        result_rows = CLICKHOUSE_CLIENT.query(
            query, parameters={"exist_points": exist_points_set}
        ).result_rows
        result_rows_len = len(result_rows)

        for index, row in enumerate(result_rows):
            # "message_id" rather than "id" to avoid shadowing the built-in.
            message_id, chat_username, telegram_id, message, timestamp = row
            link = f"https://t.me/{chat_username}/{telegram_id}"
            self.stdout.write(f"Processing {index + 1}/{result_rows_len} link: {link}")

            # Extract features from the message and store them, together with
            # the raw content and source link, as a point in Qdrant.
            features = extract_features(message)
            features_dict = features.model_dump()
            add_vectors(
                "vacancies",
                message_id,
                features_dict,
                {
                    "content": message,
                    "features_json": features_dict,
                    "link": link,
                    "timestamp": timestamp,
                },
            )
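
# For orientation: vacancies.main.vector_store is not shown in this file. A
# minimal sketch of an ``add_vectors`` compatible with the call above could
# wrap qdrant-client's upsert; the names and endpoint below are assumptions,
# not the project's actual implementation:
#
#     from qdrant_client import QdrantClient
#     from qdrant_client.models import PointStruct
#
#     client = QdrantClient(url="http://localhost:6333")  # assumed endpoint
#
#     def add_vectors(collection_name, point_id, vector, payload):
#         client.upsert(
#             collection_name=collection_name,
#             points=[PointStruct(id=point_id, vector=vector, payload=payload)],
#         )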