[130058] trunk/dports/textproc/word2vec
hum at macports.org
hum at macports.org
Thu Dec 25 03:59:20 PST 2014
Revision: 130058
https://trac.macports.org/changeset/130058
Author: hum at macports.org
Date: 2014-12-25 03:59:20 -0800 (Thu, 25 Dec 2014)
Log Message:
-----------
word2vec: update to 20140915; disable mt variant; add livecheck
Modified Paths:
--------------
trunk/dports/textproc/word2vec/Portfile
trunk/dports/textproc/word2vec/files/patch-demo.diff
Modified: trunk/dports/textproc/word2vec/Portfile
===================================================================
--- trunk/dports/textproc/word2vec/Portfile 2014-12-25 10:53:41 UTC (rev 130057)
+++ trunk/dports/textproc/word2vec/Portfile 2014-12-25 11:59:20 UTC (rev 130058)
@@ -4,8 +4,7 @@
PortSystem 1.0
name word2vec
-version 20131218
-revision 1
+version 20140915
categories textproc
maintainers hum openmaintainer
@@ -23,9 +22,11 @@
fetch.type svn
svn.url http://word2vec.googlecode.com/svn/trunk
-svn.revision 37
+svn.revision 41
worksrcdir trunk
+depends_run port:wget
+
patchfiles patch-malloc.diff \
patch-compute-accuracy.c.diff \
patch-demo.diff
@@ -33,7 +34,7 @@
use_configure no
variant universal {}
-configure.optflags -O2
+configure.optflags -O3
build.args CC="${configure.cc}" \
CFLAGS="${configure.cflags} [get_canonical_archflags] -lm -pthread -Wall -funroll-loops"
@@ -63,16 +64,9 @@
${destroot}${docdir}
}
-variant mt description {Apply multiple threads patch} {
- distfiles-append word2vec.local.tgz:mt
- master_sites-append http://www.chokkan.org/software/word2vec-multi/:mt
- checksums rmd160 5c9092531f1c4d8f5482359e9d78f847adcd260c \
- sha256 57476a59f3f485ee5ada7214caf67fcbfa53f78283a7e85c5b6c764a96171844
- post-patch {
- system -W ${worksrcpath} "patch -p1 < ${workpath}/word2vec.local/word2vec.local.patch"
- }
-}
+variant mt description {disabled: Apply multiple threads patch} {}
-default_variants +mt
-
-livecheck.type none
+livecheck.type regex
+livecheck.url https://code.google.com/p/word2vec/source/list
+livecheck.version ${svn.revision}
+livecheck.regex r(\\d+)
Modified: trunk/dports/textproc/word2vec/files/patch-demo.diff
===================================================================
--- trunk/dports/textproc/word2vec/files/patch-demo.diff 2014-12-25 10:53:41 UTC (rev 130057)
+++ trunk/dports/textproc/word2vec/files/patch-demo.diff 2014-12-25 11:59:20 UTC (rev 130058)
@@ -1,116 +1,72 @@
---- demo-analogy.sh.orig 2014-02-22 20:36:04.000000000 +0900
-+++ demo-analogy.sh 2014-02-22 20:27:27.000000000 +0900
-@@ -1,11 +1,13 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
-- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
-- gzip -d text8.gz -f
-+ curl -O http://mattmahoney.net/dc/text8.zip
-+ unzip text8.zip
- fi
- echo -----------------------------------------------------------------------------------------------------
- echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
+--- demo-analogy.sh.orig 2014-09-07 01:54:27.000000000 +0900
++++ demo-analogy.sh 2014-12-24 22:55:24.000000000 +0900
+@@ -7,5 +7,5 @@
+ echo Note that for the word analogy to perform well, the model should be trained on much larger data set
echo Example input: paris france berlin
- echo -----------------------------------------------------------------------------------------------------
--time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+ echo ---------------------------------------------------------------------------------------------------
+-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
-./word-analogy vectors.bin
-+if [ ! -e vectors.bin ]; then
-+ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-+fi
++time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
+@EXECDIR@/word-analogy vectors.bin
---- demo-classes.sh.orig 2014-02-22 20:36:09.000000000 +0900
-+++ demo-classes.sh 2014-02-22 20:22:53.000000000 +0900
-@@ -1,8 +1,8 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
-- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
-- gzip -d text8.gz -f
-+ curl -O http://mattmahoney.net/dc/text8.zip
-+ unzip text8.zip
+--- demo-classes.sh.orig 2014-09-07 01:54:27.000000000 +0900
++++ demo-classes.sh 2014-12-24 22:57:00.000000000 +0900
+@@ -3,6 +3,6 @@
+ wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+ gzip -d text8.gz -f
fi
--time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
-+time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
+-time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
++time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
sort classes.txt -k 2 -n > classes.sorted.txt
echo The word classes were saved to file classes.sorted.txt
---- demo-phrase-accuracy.sh.orig 2014-02-22 20:36:25.000000000 +0900
-+++ demo-phrase-accuracy.sh 2014-02-22 20:29:40.000000000 +0900
-@@ -1,12 +1,14 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
-- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
-- gzip -d text8.gz -f
-+ curl -O http://mattmahoney.net/dc/text8.zip
-+ unzip text8.zip
+--- demo-phrase-accuracy.sh.orig 2014-09-07 01:54:27.000000000 +0900
++++ demo-phrase-accuracy.sh 2014-12-24 22:57:51.000000000 +0900
+@@ -4,8 +4,8 @@
+ gzip -d news.2012.en.shuffled.gz -f
fi
- echo ----------------------------------------------------------------------------------------------------------------
- echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
- echo To achieve better accuracy, larger training set is needed
- echo ----------------------------------------------------------------------------------------------------------------
--time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
--time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
--./compute-accuracy vectors-phrase.bin <questions-phrases.txt
-+if [ ! -e vectors-phrase.bin ]; then
-+ time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
-+ time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
-+fi
-+@EXECDIR@/compute-accuracy vectors-phrase.bin < @EXDIR@/questions-phrases.txt
---- demo-phrases.sh.orig 2014-02-22 20:36:17.000000000 +0900
-+++ demo-phrases.sh 2014-02-22 20:30:19.000000000 +0900
-@@ -1,8 +1,10 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
-- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
-- gzip -d text8.gz -f
-+ curl -O http://mattmahoney.net/dc/text8.zip
-+ unzip text8.zip
+ sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
+-time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
+-time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
++time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
++time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
+ tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
+-time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
+-./compute-accuracy vectors-phrase.bin < questions-phrases.txt
++time @EXECDIR@/word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
++@EXECDIR@/compute-accuracy vectors-phrase.bin < questions-phrases.txt
+--- demo-phrases.sh.orig 2014-09-07 01:54:27.000000000 +0900
++++ demo-phrases.sh 2014-12-24 22:58:20.000000000 +0900
+@@ -4,8 +4,8 @@
+ gzip -d news.2012.en.shuffled.gz -f
fi
--time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
--time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+ sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
+-time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
+-time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
++time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
++time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
+ tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
+-time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
-./distance vectors-phrase.bin
-\ No newline at end of file
-+if [ ! -e vectors-phrase.bin ]; then
-+ time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
-+ time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-+fi
++time @EXECDIR@/word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
+@EXECDIR@/distance vectors-phrase.bin
---- demo-word-accuracy.sh.orig 2014-02-22 20:36:32.000000000 +0900
-+++ demo-word-accuracy.sh 2014-02-22 20:31:16.000000000 +0900
-@@ -1,8 +1,10 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
-- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
-- gzip -d text8.gz -f
-+ curl -O http://mattmahoney.net/dc/text8.zip
-+ unzip text8.zip
+--- demo-word-accuracy.sh.orig 2014-09-07 01:54:27.000000000 +0900
++++ demo-word-accuracy.sh 2014-12-24 22:58:49.000000000 +0900
+@@ -3,6 +3,6 @@
+ wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+ gzip -d text8.gz -f
fi
--time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
-./compute-accuracy vectors.bin 30000 < questions-words.txt
-# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
-+if [ ! -e vectors.bin ]; then
-+ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-+fi
-+@EXECDIR@/compute-accuracy vectors.bin 30000 < @EXDIR@/questions-words.txt
-+# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < @EXDIR@/questions-words.txt
---- demo-word.sh.orig 2014-02-22 20:36:47.000000000 +0900
-+++ demo-word.sh 2014-02-22 20:31:57.000000000 +0900
-@@ -1,7 +1,9 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
-- wget http://mattmahoney.net/dc/text8.zip -O text8.gz
-- gzip -d text8.gz -f
-+ curl -O http://mattmahoney.net/dc/text8.zip
-+ unzip text8.zip
++time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
++@EXECDIR@/compute-accuracy vectors.bin 30000 < questions-words.txt
++# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < questions-words.txt
+--- demo-word.sh.orig 2014-09-07 01:54:27.000000000 +0900
++++ demo-word.sh 2014-12-24 22:59:00.000000000 +0900
+@@ -3,5 +3,5 @@
+ wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+ gzip -d text8.gz -f
fi
--time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
-./distance vectors.bin
-\ No newline at end of file
-+if [ ! -e vectors.bin ]; then
-+ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-+fi
++time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
+@EXECDIR@/distance vectors.bin
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.macosforge.org/pipermail/macports-changes/attachments/20141225/fd35cf21/attachment.html>
More information about the macports-changes
mailing list