# Download every PDF linked from the page: lynx dumps the page as text, awk
# keeps the lines mentioning "pdf" and prints their last field (the URL),
# and the resulting list is handed to wget as separate arguments.
!wget $(lynx -dump http://skarbnicakaszubska.pl/najo-uczba/ | awk '/pdf/ {print $NF}')

For the most part, the text extracted from the pdfs is fine as is; one of the files has multiple articles, several with translations, making it potentially useful as a parallel corpus.

The text seems to come out fine with pdftotext, so I haven't bothered doing anything else.

# Extract the first article as a pair of files (_1.csb / _1.pl -- presumably the
# Kashubian text on pages 9-10 and its Polish counterpart on pages 11-12).
# Each pipeline drops the bulletin title line, the page-number line and blank
# lines. The page-number patterns are anchored with `$` so that only a line
# consisting solely of the number is removed, not every line that merely
# starts with "10" / "12".
!pdftotext -nopgbrk -f 9 -l 10 ZKP_biuletynRJK_2015_internet.pdf - | grep -v 'BIU­L E­T IN RADZËZNË KASZËBSCZÉGÒ JÃZËKA 2015'|grep -v '^10$'|grep -v '^$' > ZKP_biuletynRJK_2015_internet_1.csb.txt
!pdftotext -nopgbrk -f 11 -l 12 ZKP_biuletynRJK_2015_internet.pdf - | grep -v 'BIU­L E­T YN RADY JĘZYKA KASZUBSKIEGO 2015'|grep -v '^12$'|grep -v '^$' > ZKP_biuletynRJK_2015_internet_1.pl.txt
# Sections wl1..wl6: one pdftotext call per page range. A single inverted grep
# with several -e patterns removes, in one pass, the bulletin title line, the
# "Pòstanowienia ..." heading line, empty lines and lines that consist only of
# a page number in the listed range.
!pdftotext -nopgbrk -f 14 -l 20 ZKP_biuletynRJK_2015_internet.pdf - | grep -v -e 'BIU­L E­T IN RADZËZNË KASZËBSCZÉGÒ JÃZËKA 2015' -e '^Pòstanowienia Radzëznë Kaszëbsczégò Jãzëka' -e '^$' -e '^1[5-9]$' -e '^20$' > ZKP_biuletynRJK_2015_internet_wl1.txt
!pdftotext -nopgbrk -f 21 -l 23 ZKP_biuletynRJK_2015_internet.pdf - | grep -v -e 'BIU­L E­T IN RADZËZNË KASZËBSCZÉGÒ JÃZËKA 2015' -e '^Pòstanowienia Radzëznë Kaszëbsczégò Jãzëka' -e '^$' -e '^2[1-9]$' > ZKP_biuletynRJK_2015_internet_wl2.txt
!pdftotext -nopgbrk -f 24 -l 29 ZKP_biuletynRJK_2015_internet.pdf - | grep -v -e 'BIU­L E­T IN RADZËZNË KASZËBSCZÉGÒ JÃZËKA 2015' -e '^Pòstanowienia Radzëznë Kaszëbsczégò Jãzëka' -e '^$' -e '^2[1-9]$' > ZKP_biuletynRJK_2015_internet_wl3.txt
!pdftotext -nopgbrk -f 30 -l 48 ZKP_biuletynRJK_2015_internet.pdf - | grep -v -e 'BIU­L E­T IN RADZËZNË KASZËBSCZÉGÒ JÃZËKA 2015' -e '^Pòstanowienia Radzëznë Kaszëbsczégò Jãzëka' -e '^$' -e '^[34][0-9]$' > ZKP_biuletynRJK_2015_internet_wl4.txt
!pdftotext -nopgbrk -f 49 -l 49 ZKP_biuletynRJK_2015_internet.pdf - | grep -v -e 'BIU­L E­T IN RADZËZNË KASZËBSCZÉGÒ JÃZËKA 2015' -e '^Pòstanowienia Radzëznë Kaszëbsczégò Jãzëka' -e '^$' -e '^[34][0-9]$' > ZKP_biuletynRJK_2015_internet_wl5.txt
!pdftotext -nopgbrk -f 50 -l 65 ZKP_biuletynRJK_2015_internet.pdf - | grep -v -e 'BIU­L E­T IN RADZËZNË KASZËBSCZÉGÒ JÃZËKA 2015' -e '^Pòstanowienia Radzëznë Kaszëbsczégò Jãzëka' -e '^$' -e '^[56][0-9]$' > ZKP_biuletynRJK_2015_internet_wl6.txt
def runner(file, start, end, suffix):
    """Extract a page range of a PDF to a cleaned text file using pdftotext.

    Parameters:
        file:   path of the PDF; a trailing '.pdf' is dropped to form the
                output base name.
        start:  first page to convert (passed to pdftotext as -f).
        end:    last page to convert (passed to pdftotext as -l).
        suffix: tag appended to the base name; output is '<base>_<suffix>.txt'.

    The extracted text is filtered to drop the Kashubian and Polish bulletin
    title lines, the 'Pòstanowienia ...' heading line, empty lines, and lines
    made up solely of a two-digit number or of a three-digit number 100-499
    (presumably printed page numbers). Nothing is returned; the result is
    written to the output file.
    """
    # Remove only the extension; str.replace would also strip any '.pdf' that
    # happens to occur in the middle of the name.
    base = file[:-len('.pdf')] if file.endswith('.pdf') else file
    outfile = f"{base}_{suffix}.txt"
    # Paths are double-quoted so names containing spaces reach the shell intact.
    !pdftotext -nopgbrk -f {start} -l {end} "{file}" - | grep -v 'BIU­L E­T IN RADZËZNË KASZËBSCZÉGÒ JÃZËKA 2015'|grep -v 'BIU­L E­T YN RADY JĘZYKA KASZUBSKIEGO 2015'|grep -v '^Pòstanowienia Radzëznë Kaszëbsczégò Jãzëka'|grep -v '^$'|grep -v '^[0-9][0-9]$'|grep -v '^[1-4][0-9][0-9]$' > "{outfile}"
# Remaining extractions from the bulletin, expressed as a table of
# (first page, last page, output suffix) and processed in the listed order.
bulletin_pdf = 'ZKP_biuletynRJK_2015_internet.pdf'
page_ranges = [
    (68, 74, 'wl7'),
    (75, 77, 'wl8'),
    (78, 83, 'wl9'),
    (84, 102, 'wl10'),
    (103, 103, 'wl11'),
    (104, 119, 'wl12'),
    (122, 128, 'csb2'),
    (129, 132, 'csb3'),
    (133, 144, 'csb4'),
    (145, 151, 'csb5'),
    (153, 161, 'csb6'),
    (162, 166, 'csb7'),
    (168, 178, 'csb8'),
    (179, 185, 'csb9'),
    # it took me this long to remember that there's a table of contents!
    (186, 197, 'csb10'),
    (198, 204, 'csb11'),
    (205, 211, 'csb12'),
    (212, 220, 'csb13'),
    (222, 228, 'pl2'),
    (229, 237, 'plx1'),
    (238, 241, 'pl3'),
    (242, 248, 'plx2'),
    (249, 254, 'plx3'),
    (255, 266, 'pl4'),
    (267, 274, 'pl5'),
    (275, 283, 'pl6'),
    (284, 289, 'pl7'),
    (290, 300, 'plx4'),
    (301, 313, 'pl8'),
    (314, 320, 'pl9'),
    (5, 8, 'toc'),
    (321, 333, 'pl10'),
    (334, 359, 'plx5'),
    (360, 367, 'pl11'),
    (368, 374, 'pl12'),
    (375, 390, 'plx6'),
    (391, 396, 'plx7'),
    (397, 404, 'pl13'),
]
for first_page, last_page, tag in page_ranges:
    runner(bulletin_pdf, first_page, last_page, tag)

Now, the rest of the PDFs, converted in full:

# Convert every PDF whose name starts with a digit or 'N' to text in full.
# `$$i` makes IPython pass a literal `$i` to the shell (a single `$i` would be
# replaced by a Python variable named `i` if one exists), and the quotes keep
# file names containing spaces as one argument.
!for i in [0-9N]*.pdf; do pdftotext "$$i"; done
# `var = !cmd` captures the output as a list of lines, so the host name has to
# be searched for inside each line; a plain `in` on the list compares whole
# lines and would never match, deleting the PDFs on every machine.
uname = !uname -a
if not any('LAPTOP-6PFTN7M9' in line for line in uname):
    # Not on the named laptop: remove the downloaded PDFs.
    !rm -- *.pdf