vision-career/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py
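"""Collect vacancy posts from parsed Telegram messages.

Reads recent chat messages from ClickHouse, keeps those that look like job
vacancies, extracts structured features from them, embeds the features and
stores the resulting vectors in the Qdrant "vacancies" collection.

Run with: python manage.py collect_vacancies_from_telegram_messages
"""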

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from itertools import batched

import clickhouse_connect
from django.core.management import BaseCommand
from qdrant_client.models import OrderBy

from vacancies.conf.settings import CLICKHOUSE_HOST, CLICKHOUSE_PORT
from vacancies.main.vector_store import (
    add_vectors,
    batch_extract_features,
    embed_features,
    qdrant_client,
)

BATCH_SIZE = 50

clickhouse_client = clickhouse_connect.create_client(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT)
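# Heuristic vacancy filter: deduplicate by message text, keep messages longer
# than 150 characters that match at least five vacancy-related keywords and
# none of the markers typical for resume / "looking for a job" posts.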
query = """
SELECT DISTINCT ON (message) id, chat_username, telegram_id, message, timestamp
FROM telegram_parser_chatmessage
WHERE timestamp >= %(timestamp)s
    AND length(message) > 150
    AND arrayCount(x -> position(message, x) > 0, [
        'ваканси', 'ищем', 'требуется', 'разработчик', 'будет плюсом',
        'зарплат', 'оклад', 'з/п', 'руб', 'опыт',
        'требовани', 'обязанности', 'условия', 'офис',
        'удаленн', 'гибкий график', 'полный день', 'занятост',
        'резюме', 'собеседовани', 'junior', 'middle', 'senior', 'ждем', 'компани',
        'заниматься', 'формат', 'занятость', 'вилка', 'должност', 'контакт'
    ]) >= 5
    AND arrayCount(x -> position(lower(message), x) > 0, [
        'о себе', 'обо мне', 'умею', '#ищу', '#резюме', 'университет', 'колледж'
    ]) = 0
ORDER BY timestamp ASC
"""


class Command(BaseCommand):
    help = "Collect vacancies from telegram messages"

    def handle(self, *args, **options):
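        # Resume from the newest vacancy already stored in Qdrant; fall back to
        # the last 30 days when the collection is empty.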
        response = qdrant_client.scroll(
            collection_name="vacancies",
            limit=1,
            order_by=OrderBy(key="timestamp", direction="desc"),
        )
        last_point_timestamp = datetime.now() - timedelta(days=30)
        if response[0]:
            last_point_timestamp = response[0][0].payload["timestamp"]

        result_rows = clickhouse_client.query(query, parameters={"timestamp": last_point_timestamp}).result_rows
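        # Process messages in batches: extract structured features, embed them
        # in worker threads, then store each vacancy in the "vacancies" collection.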
        for index, rows in enumerate(batched(result_rows, BATCH_SIZE)):
            # Row layout: (id, chat_username, telegram_id, message, timestamp).
            vacancies_features = batch_extract_features([row[3] for row in rows])
            with ThreadPoolExecutor() as pool:
                vacancies_vectors = pool.map(embed_features, [vacancy_features.model_dump() for vacancy_features in vacancies_features])
            for (id, chat_username, telegram_id, message, timestamp), vacancy_features, vacancy_vectors in zip(rows, vacancies_features, vacancies_vectors):
                payload = {
                    "content": message,
                    "features_json": vacancy_features.model_dump(),
                    "link": f"https://t.me/{chat_username}/{telegram_id}",
                    "timestamp": timestamp,
                }
                add_vectors("vacancies", id, vacancy_features.model_dump(), payload, vacancy_vectors)