Integrate article parser

This commit is contained in:
2022-11-30 09:20:35 +03:00
parent 8cee3406ea
commit b8eb2b9bf5
7 changed files with 44 additions and 21 deletions

View File

@ -1,7 +1,5 @@
import requests
from bs4 import BeautifulSoup
from django.conf import settings
from newspaper import Article, Config
def sanitize_img_size(html: str):
@ -13,10 +11,10 @@ def sanitize_img_size(html: str):
# Fetch the page at `url` and return its parsed content.
# NOTE(review): this is a diff hunk whose +/- markers and indentation were
# lost; it appears to hold two alternative bodies back to back -- confirm
# against the repository before reading it as a single function.
def get_content(url: str):
# Presumably the pre-commit body: route the request through the service at
# settings.READABILITY_HOST and return its decoded JSON response.
# NOTE(review): `url` is appended to the query string without escaping, and
# requests.get is called without a timeout.
if settings.READABILITY_HOST:
url = settings.READABILITY_HOST.rstrip(
'/') + '/api/content/v1/parser?url=' + url
try:
return requests.get(url).json()
# NOTE(review): only KeyError is caught; connection errors or a non-JSON
# response would propagate to the caller -- confirm that is intended.
except KeyError:
return None
# Presumably the post-commit body: download and parse the page with
# newspaper and return the Article object itself (not a dict).
config = Config()
# Callers in this commit read `article_html` from the result; presumably this
# flag is what makes the parser keep it -- confirm in the newspaper docs.
config.keep_article_html = True
article = Article(url, config=config)
# NOTE(review): download/parse failures are not handled here.
article.download()
article.parse()
return article

View File

@ -0,0 +1,18 @@
# Generated by Django 4.1.3 on 2022-11-30 06:10
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the optional ``img`` URL column to the ``article`` model.

    NOTE(review): the field is declared without ``max_length``, so Django's
    default limit for ``URLField`` applies -- confirm it is long enough for
    the image URLs stored here.
    """

    # Must be applied after the previous migration of the ``later42`` app.
    dependencies = [
        (
            'later42',
            '0006_remove_url_content_remove_url_title_article_short_and_more',
        ),
    ]

    operations = [
        migrations.AddField(
            model_name='article',
            name='img',
            # Nullable and allowed to be blank, so the column can be added
            # without supplying a default for existing rows.
            field=models.URLField(blank=True, null=True),
        ),
    ]

View File

@ -8,3 +8,4 @@ class Article(models.Model):
content = models.TextField(blank=True, null=True)
title = models.CharField(max_length=2000, blank=True, null=True)
short = models.TextField(blank=True, null=True)
img = models.URLField(blank=True, null=True)

View File

@ -21,8 +21,6 @@ if AIRBRAKE_PROJECT_ID is not None and AIRBRAKE_PROJECT_KEY is not None:
# Task: save `url` for the user with primary key `user_id`, then fetch the
# page and store its parsed content in a new Article row.
# NOTE(review): this span is two diff hunks whose +/- markers and indentation
# were lost, so removed and added lines sit side by side -- confirm against
# the repository.
@shared_task()
def get_url_content_task(url, user_id):
# Presumably removed by this commit (debug output).
print(url)
print(user_id)
user = User.objects.get(pk=int(user_id))
# The URL row is saved before any content is fetched.
url_object = URL(url=url, user=user)
url_object.save()
@ -30,7 +28,10 @@ def get_url_content_task(url, user_id):
data = get_content(url)
article = Article.objects.create(url=url_object)
# Presumably the pre-commit assignments: `data` is indexed like a mapping.
article.content = data['rich_content']
article.title = data['title']
article.short = data['excerpt']
# Presumably the post-commit assignments: `data` is read through attributes.
article.content = data.article_html
article.title = data.title
# The stored summary is the first 150 characters of the extracted text.
article.short = data.text[:150]
# Record a preview image only when the parser reports one.
if data.has_top_image():
article.img = data.top_image
article.save()

View File

@ -15,6 +15,13 @@
<a href="{% url 'archive_url' d.url_id %}"><span class="fa-regular fa-square-check"></span></a>
</div>
</div>
{# Preview image, rendered only when the entry has one; it links to the reader view. #}
{# d.title is assumed to be available next to d.img and d.short; a missing variable renders as an empty string. #}
{% if d.img %}
<p class="post-preview">
  <a href="{% url 'reader' d.url_id %}">
    {# The width/height attributes take integer pixel values only, so the full-width sizing is done with a class instead. #}
    <img class="img-fluid w-100" src="{{ d.img }}" alt="{{ d.title }}" />
  </a>
</p>
{% endif %}
{% if d.short %}
<p class="post-meta">
{{ d.short }}

View File

@ -1,8 +1,5 @@
from multiprocessing import context
from django.contrib.auth.decorators import login_required
from django.shortcuts import render, redirect
from django.core.paginator import Paginator
from django.conf import settings
from django.shortcuts import render
from later42.libs.content import get_content, sanitize_img_size
from later42.models.article import Article
from later42.models.urls import URL
@ -15,11 +12,11 @@ def get(request, url_id=None):
# Body of the reader view: build the template context from the stored Article
# for `url`, falling back to fetching the page again when that fails.
# NOTE(review): this is a diff hunk whose +/- markers and indentation were
# lost, so removed and added lines sit side by side -- confirm against the
# repository.
content = {}
try:
article = Article.objects.get(url=url)
# Presumably the pre-commit line; the next line looks like its replacement.
content['title'] = url.title
content['title'] = article.title
content['url'] = url.url
content['rich_content'] = sanitize_img_size(article.content)
# NOTE(review): a bare `except:` catches every exception, not only a missing
# Article row, so unrelated errors also take the fallback path.
except:
content = get_content(url.url)
# Presumably the pre-commit line (result indexed like a mapping).
content['rich_content'] = sanitize_img_size(content['rich_content'])
# NOTE(review): this reads an attribute from `content` and then assigns an
# item on the same object. The get_content hunk in this commit returns a
# newspaper Article; confirm it supports item assignment, otherwise this line
# raises TypeError.
content['rich_content'] = sanitize_img_size(content.article_html)
context = {'url': url, 'content': content}
return render(request, 'reader.html', context)

View File

@ -10,3 +10,4 @@ six==1.16.0
celery[redis]==5.2.7
pybrake==1.10.0
sentry-sdk==1.11.0
newspaper3k==0.2.8