blob: 104917b4e5a03a30d4e1da718922800623692050 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
|
# Copyright 2020-2021 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
EAPI=8
# check-reqs is inherited for the CHECKREQS_DISK_* settings made further down.
inherit check-reqs
DESCRIPTION="Data files for NLTK"
HOMEPAGE="https://www.nltk.org/nltk_data/"
# at least some of the files have poorly documented licenses
# TODO: create a USE flag for free-ish subset
LICENSE="all-rights-reserved"
SLOT="0"
KEYWORDS="amd64 x86"
# extra: additionally fetch and unpack the PACKAGES_UNPACK_EXTRA_2020 list.
IUSE="extra"
# NOTE(review): presumably restricted because of the unclear licensing
# mentioned above -- confirm before relaxing.
RESTRICT="bindist mirror"
# Every distfile is a .zip archive (see add_data), hence the unzip dependency.
BDEPEND="app-arch/unzip"
# Packages that stay zipped: they are fetched with the 20200312 stamp and
# installed as "<id>.zip" by src_install (install_zips); src_unpack never
# extracts them. Entries have the form "<subdir>/<id>".
PACKAGES_ZIP_2020=(
# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort
corpora/comtrans
corpora/conll2007
corpora/jeita
corpora/knbc
corpora/machado
corpora/masc_tagged
corpora/nombank.1.0
corpora/panlex_swadesh
corpora/propbank
corpora/reuters
corpora/semcor
corpora/universal_treebanks_v20
sentiment/vader_lexicon
stemmers/snowball_data
)
# Packages that are extracted: they are fetched with the 20200312 stamp and
# unpacked by src_unpack into "${S}/<subdir>". Entries have the form
# "<subdir>/<id>".
PACKAGES_UNPACK_2020=(
# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort
corpora/abc
corpora/alpino
corpora/brown
corpora/cess_cat
corpora/cess_esp
corpora/chat80
corpora/city_database
corpora/cmudict
corpora/comparative_sentences
corpora/conll2000
corpora/conll2002
corpora/crubadan
corpora/dependency_treebank
corpora/dolch
corpora/europarl_raw
corpora/floresta
corpora/framenet_v15
corpora/framenet_v17
corpora/gazetteers
corpora/genesis
corpora/gutenberg
corpora/ieer
corpora/indian
corpora/lin_thesaurus
corpora/mac_morpho
corpora/movie_reviews
corpora/mte_teip5
corpora/names
corpora/nonbreaking_prefixes
corpora/nps_chat
corpora/omw
corpora/opinion_lexicon
corpora/pl196x
corpora/ppattach
corpora/product_reviews_1
corpora/product_reviews_2
corpora/pros_cons
corpora/ptb
corpora/qc
corpora/rte
corpora/senseval
corpora/sentence_polarity
corpora/sentiwordnet
corpora/shakespeare
corpora/state_union
corpora/subjectivity
corpora/swadesh
corpora/switchboard
corpora/timit
corpora/toolbox
corpora/treebank
corpora/twitter_samples
corpora/udhr
corpora/udhr2
corpora/verbnet
corpora/webtext
corpora/wordnet
corpora/wordnet_ic
corpora/words
grammars/book_grammars
grammars/large_grammars
grammars/sample_grammars
misc/perluniprops
models/bllip_wsj_no_aux
models/moses_sample
models/wmt15_eval
models/word2vec_sample
stemmers/porter_test
stemmers/rslp
taggers/averaged_perceptron_tagger
taggers/averaged_perceptron_tagger_ru
taggers/universal_tagset
tokenizers/punkt
)
# Extracted package whose distfile carries the 20211023 stamp instead of the
# 2020 one (see the add_data/unpack_data calls).
PACKAGES_UNPACK_2021=(
corpora/stopwords
)
# Extracted packages whose distfiles carry the 20211221 stamp (see the
# add_data/unpack_data calls).
PACKAGES_UNPACK_2021_12=(
corpora/inaugural
corpora/omw-1.4
corpora/sinica_treebank
corpora/wordnet2021
corpora/wordnet31
)
# Optional packages: only fetched (inside the "extra? ( ... )" group of
# SRC_URI) and extracted when USE=extra is enabled. They use the 20200312
# stamp.
PACKAGES_UNPACK_EXTRA_2020=(
chunkers/maxent_ne_chunker
corpora/biocreative_ppi
corpora/brown_tei
corpora/kimmo
corpora/paradigms
corpora/pe08
corpora/pil
corpora/problem_reports
corpora/smultron
corpora/unicode_samples
corpora/verbnet3
corpora/ycoe
grammars/basque_grammars
grammars/spanish_grammars
help/tagsets
misc/mwa_ppdb
taggers/maxent_treebank_pos_tagger
)
# Append one SRC_URI entry per package to the global SRC_URI.
#
# $1     - version stamp embedded in the local distfile name
# $2...  - packages in "<subdir>/<id>" form
#
# Every package is fetched from the upstream gh-pages tree and renamed to
# "nltk-<id>-<stamp>.zip" via the "->" arrow.
add_data() {
	local stamp=${1} entry
	shift
	for entry in "$@"; do
		SRC_URI+=$'\n'"https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${entry}.zip"
		SRC_URI+=$'\n'"-> nltk-${entry#*/}-${stamp}.zip"
	done
}
# Unconditional distfiles: each package list is registered together with the
# version stamp (apparently a YYYYMMDD date) used in its local file name.
add_data 20200312 "${PACKAGES_ZIP_2020[@]}" "${PACKAGES_UNPACK_2020[@]}"
add_data 20211023 "${PACKAGES_UNPACK_2021[@]}"
add_data 20211221 "${PACKAGES_UNPACK_2021_12[@]}"
# The extra packages go inside a USE-conditional group: open "extra? (",
# let add_data append the entries, then close the group.
SRC_URI+="
extra? ("
add_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}"
SRC_URI+="
)"
# Disk space settings for check-reqs; the build directory gets the same
# budget as /usr.
CHECKREQS_DISK_USR=3G
CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR}
# Extract the distfiles of the given packages below ${S}.
#
# $1     - version stamp used in the distfile names
# $2...  - packages in "<subdir>/<id>" form
#
# For each package the directory "${S}/<subdir>" is created if needed,
# entered, and "nltk-<id>-<stamp>.zip" is unpacked there. The working
# directory is left at the last package's directory.
unpack_data() {
	local stamp=${1} entry target_dir
	shift
	for entry in "$@"; do
		target_dir="${S}/${entry%/*}"
		mkdir -p "${target_dir}" || die
		cd "${target_dir}" || die
		unpack "nltk-${entry#*/}-${stamp}.zip"
	done
}
# Unpack phase: extract every "unpack" package list into its category
# directory under ${S}, each with the version stamp its distfiles were
# registered under. The PACKAGES_ZIP_2020 list is intentionally not
# extracted; src_install installs those archives as-is.
src_unpack() {
	unpack_data 20200312 "${PACKAGES_UNPACK_2020[@]}"
	unpack_data 20211023 "${PACKAGES_UNPACK_2021[@]}"
	unpack_data 20211221 "${PACKAGES_UNPACK_2021_12[@]}"
	# Use if/fi rather than `use extra && ...`: as the last command of the
	# function, a false `use extra` would otherwise become the function's
	# (non-zero) exit status even though nothing failed.
	if use extra; then
		unpack_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}"
	fi
}
# Install still-zipped packages straight from DISTDIR.
#
# $1     - version stamp used in the distfile names
# $2...  - packages in "<subdir>/<id>" form
#
# Each "nltk-<id>-<stamp>.zip" is installed as
# "/usr/share/nltk_data/<subdir>/<id>.zip", i.e. the stamp and the "nltk-"
# prefix are dropped again on installation.
install_zips() {
	local stamp=${1} entry name
	shift
	for entry in "$@"; do
		name=${entry#*/}
		insinto "/usr/share/nltk_data/${entry%/*}"
		newins "${DISTDIR}/nltk-${name}-${stamp}.zip" "${name}.zip"
	done
}
# Install phase: move everything from the current directory (the category
# directories created during unpacking) into the image under
# /usr/share/nltk_data, then add the packages that are shipped as .zip files.
src_install() {
	local data_root=/usr/share/nltk_data

	dodir "${data_root}"
	# Moving instead of copying avoids duplicating the whole data set.
	mv * "${ED}${data_root}/" || die
	install_zips 20200312 "${PACKAGES_ZIP_2020[@]}"
}
|