Test getPDF Text

This commit is contained in:
2023-03-17 22:59:18 -04:00
parent ca7a0c026a
commit f9ee755a29
4 changed files with 51 additions and 16 deletions

View File

@@ -13,6 +13,27 @@
let
inherit (poetry2nix.legacyPackages.${system}) mkPoetryApplication mkPoetryEnv mkPoetryPackages defaultPoetryOverrides;
pkgs = nixpkgs.legacyPackages.${system};
# Project-specific package overrides layered on top of poetry2nix's defaults.
# Bound once here so it can be reused via `inherit overrides;`.
overrides = defaultPoetryOverrides.extend
(self: super: {
# nextcord: append setuptools to its propagatedBuildInputs.
nextcord = super.nextcord.overridePythonAttrs
(
old: {
propagatedBuildInputs = (
# `or []` falls back to an empty list when the attribute is not set.
old.propagatedBuildInputs or []
) ++ [super.setuptools];
}
);
# pdftotext: append poppler (taken from nixpkgs) to its buildInputs.
# NOTE(review): presumably needed because the package builds against
# poppler's C++ library — confirm against the pdftotext build docs.
pdftotext = super.pdftotext.overridePythonAttrs
(
old: {
buildInputs = (
old.buildInputs or []
) ++ (with pkgs; [
poppler
]);
}
);
});
in rec {
hydraJobs = pkgs.lib.optionalAttrs
@@ -49,17 +70,7 @@ DOC
projectDir = self;
# TODO: Upload to poetry2nix
# https://github.com/nix-community/poetry2nix/blob/master/docs/edgecases.md
overrides = defaultPoetryOverrides.extend
(self: super: {
nextcord = super.nextcord.overridePythonAttrs
(
old: {
propagatedBuildInputs = (
old.propagatedBuildInputs or []
) ++ [super.setuptools];
}
);
});
inherit overrides;
};
default = self.packages.${system}.handyhelper;
};
@@ -70,8 +81,11 @@ DOC
poetry2nix.packages.${system}.poetry
(mkPoetryEnv {
projectDir = self;
inherit overrides;
})
sops
# For pdftotext
poppler
];
};
}

View File

@@ -5,6 +5,7 @@ import asyncio
import os
import requests as req
from bs4 import BeautifulSoup as soup
import pdftotext
bot = commands.Bot()
@@ -16,8 +17,6 @@ async def search(txt):
doc = soup(resp.text, 'html.parser')
print(doc.title)
if 'not found' in doc.find('title').get_text():
return None
@@ -29,6 +28,12 @@ async def search(txt):
'pdf': pdf[2:]
}
async def getPDF(url):
    """Download the PDF at ``url`` and return its extracted text.

    Args:
        url: Location of the PDF document to fetch.

    Returns:
        A single string containing the text of every page, with pages
        separated by a blank line.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    import io  # local import: only this helper needs an in-memory file

    # NOTE(review): requests is a blocking client, so the event loop is
    # stalled for the duration of the download.
    resp = req.get(url)
    # Fail early instead of handing an HTML error page to the PDF parser.
    resp.raise_for_status()
    # pdftotext.PDF reads from a binary file-like object, not raw bytes,
    # so wrap the downloaded payload in a BytesIO buffer.
    pages = pdftotext.PDF(io.BytesIO(resp.content))
    return "\n\n".join(pages)
@bot.event
async def on_ready():
print(f'We have logged in as {bot.user}')
@@ -47,8 +52,12 @@ async def summarize(
await interaction.followup.send(f"Unable to find article: {article}")
return
await interaction.followup.send(f"""Article Found: \n{resp['ref']}
PDF loc: {resp['pdf']}""")
await interaction.followup.send(f"Article Found: {resp['ref']}
Parsing PDF...")
await interaction.followup.edit_message(f"""Article Found: {resp['ref']}
```
{getPDF(resp['pdf'])[:1000]}
```""")
def main():
# TODO: Import bot token from env

13
poetry.lock generated
View File

@@ -677,6 +677,17 @@ files = [
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
testing = ["docopt", "pytest (<6.0.0)"]
[[package]]
name = "pdftotext"
version = "2.2.2"
description = "Simple PDF text extraction"
category = "main"
optional = false
python-versions = "*"
files = [
{file = "pdftotext-2.2.2.tar.gz", hash = "sha256:2a9aa89bc62022408781b39d188fabf5a3ad1103b6630f32c4e27e395f7966ee"},
]
[[package]]
name = "pexpect"
version = "4.8.0"
@@ -1156,4 +1167,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "270b70e2eeff44e2b0bb07902f333dcbc45bb8c453a16769f9fd45b5a70c4993"
content-hash = "37cd0a0c44f8d8dc60f31db3cd1c01303d4ca07f85282c6a1823ed8135714313"

View File

@@ -15,6 +15,7 @@ requests = "^2.28.2"
nextcord = "^2.4.1"
openai = "^0.27.2"
beautifulsoup4 = "^4.11.2"
pdftotext = "^2.2.2"
[tool.poetry.group.dev.dependencies]
ipython = "^8.11.0"