[130058] trunk/dports/textproc/word2vec

Thu Dec 25 03:59:20 PST 2014

Revision: 130058
          https://trac.macports.org/changeset/130058
Author:   hum at macports.org
Date:     2014-12-25 03:59:20 -0800 (Thu, 25 Dec 2014)
Log Message:
-----------
word2vec: update to 20140915; disable mt variant; add livecheck

Modified Paths:
--------------
    trunk/dports/textproc/word2vec/Portfile
    trunk/dports/textproc/word2vec/files/patch-demo.diff

Modified: trunk/dports/textproc/word2vec/Portfile
===================================================================

--- trunk/dports/textproc/word2vec/Portfile	2014-12-25 10:53:41 UTC (rev 130057)
+++ trunk/dports/textproc/word2vec/Portfile	2014-12-25 11:59:20 UTC (rev 130058)
@@ -4,8 +4,7 @@
 PortSystem          1.0
 
 name                word2vec
-version             20131218
-revision            1
+version             20140915
 categories          textproc
 maintainers         hum openmaintainer
 
@@ -23,9 +22,11 @@
 
 fetch.type          svn
 svn.url             http://word2vec.googlecode.com/svn/trunk
-svn.revision        37
+svn.revision        41
 worksrcdir          trunk
 
+depends_run         port:wget
+
 patchfiles          patch-malloc.diff \
                     patch-compute-accuracy.c.diff \
                     patch-demo.diff
@@ -33,7 +34,7 @@
 use_configure       no
 variant universal   {}
 
-configure.optflags  -O2
+configure.optflags  -O3
 
 build.args          CC="${configure.cc}" \
                     CFLAGS="${configure.cflags} [get_canonical_archflags] -lm -pthread -Wall -funroll-loops"
@@ -63,16 +64,9 @@
         ${destroot}${docdir}
 }
 
-variant mt description {Apply multiple threads patch} {
-    distfiles-append    word2vec.local.tgz:mt
-    master_sites-append http://www.chokkan.org/software/word2vec-multi/:mt
-    checksums           rmd160  5c9092531f1c4d8f5482359e9d78f847adcd260c \
-                        sha256  57476a59f3f485ee5ada7214caf67fcbfa53f78283a7e85c5b6c764a96171844
-    post-patch {
-        system -W ${worksrcpath} "patch -p1 < ${workpath}/word2vec.local/word2vec.local.patch"
-    }
-}
+variant mt description {disabled: Apply multiple threads patch} {}
 
-default_variants    +mt
-
-livecheck.type      none
+livecheck.type      regex
+livecheck.url       https://code.google.com/p/word2vec/source/list
+livecheck.version   ${svn.revision}
+livecheck.regex     r(\\d+)

Modified: trunk/dports/textproc/word2vec/files/patch-demo.diff
===================================================================
--- trunk/dports/textproc/word2vec/files/patch-demo.diff	2014-12-25 10:53:41 UTC (rev 130057)
+++ trunk/dports/textproc/word2vec/files/patch-demo.diff	2014-12-25 11:59:20 UTC (rev 130058)
@@ -1,116 +1,72 @@
---- demo-analogy.sh.orig	2014-02-22 20:36:04.000000000 +0900
-+++ demo-analogy.sh	2014-02-22 20:27:27.000000000 +0900
-@@ -1,11 +1,13 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
--  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
--  gzip -d text8.gz -f
-+  curl -O http://mattmahoney.net/dc/text8.zip
-+  unzip text8.zip
- fi
- echo -----------------------------------------------------------------------------------------------------
- echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
+--- demo-analogy.sh.orig	2014-09-07 01:54:27.000000000 +0900
++++ demo-analogy.sh	2014-12-24 22:55:24.000000000 +0900
+@@ -7,5 +7,5 @@
+ echo Note that for the word analogy to perform well, the model should be trained on much larger data set
  echo Example input: paris france berlin
- echo -----------------------------------------------------------------------------------------------------
--time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+ echo ---------------------------------------------------------------------------------------------------
+-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
 -./word-analogy vectors.bin
-+if [ ! -e vectors.bin ]; then
-+  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-+fi
++time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
 +@EXECDIR@/word-analogy vectors.bin
---- demo-classes.sh.orig	2014-02-22 20:36:09.000000000 +0900
-+++ demo-classes.sh	2014-02-22 20:22:53.000000000 +0900
-@@ -1,8 +1,8 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
--  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
--  gzip -d text8.gz -f
-+  curl -O http://mattmahoney.net/dc/text8.zip
-+  unzip text8.zip
+--- demo-classes.sh.orig	2014-09-07 01:54:27.000000000 +0900
++++ demo-classes.sh	2014-12-24 22:57:00.000000000 +0900
+@@ -3,6 +3,6 @@
+   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+   gzip -d text8.gz -f
  fi
--time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
-+time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
+-time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
++time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
  sort classes.txt -k 2 -n > classes.sorted.txt
  echo The word classes were saved to file classes.sorted.txt
---- demo-phrase-accuracy.sh.orig	2014-02-22 20:36:25.000000000 +0900
-+++ demo-phrase-accuracy.sh	2014-02-22 20:29:40.000000000 +0900
-@@ -1,12 +1,14 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
--  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
--  gzip -d text8.gz -f
-+  curl -O http://mattmahoney.net/dc/text8.zip
-+  unzip text8.zip
+--- demo-phrase-accuracy.sh.orig	2014-09-07 01:54:27.000000000 +0900
++++ demo-phrase-accuracy.sh	2014-12-24 22:57:51.000000000 +0900
+@@ -4,8 +4,8 @@
+   gzip -d news.2012.en.shuffled.gz -f
  fi
- echo ----------------------------------------------------------------------------------------------------------------
- echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
- echo To achieve better accuracy, larger training set is needed
- echo ----------------------------------------------------------------------------------------------------------------
--time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
--time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
--./compute-accuracy vectors-phrase.bin <questions-phrases.txt
-+if [ ! -e vectors-phrase.bin ]; then
-+  time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
-+  time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
-+fi
-+@EXECDIR@/compute-accuracy vectors-phrase.bin < @EXDIR@/questions-phrases.txt
---- demo-phrases.sh.orig	2014-02-22 20:36:17.000000000 +0900
-+++ demo-phrases.sh	2014-02-22 20:30:19.000000000 +0900
-@@ -1,8 +1,10 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
--  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
--  gzip -d text8.gz -f
-+  curl -O http://mattmahoney.net/dc/text8.zip
-+  unzip text8.zip
+ sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
+-time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
+-time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
++time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
++time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
+ tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
+-time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
+-./compute-accuracy vectors-phrase.bin < questions-phrases.txt
++time @EXECDIR@/word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
++@EXECDIR@/compute-accuracy vectors-phrase.bin < questions-phrases.txt
+--- demo-phrases.sh.orig	2014-09-07 01:54:27.000000000 +0900
++++ demo-phrases.sh	2014-12-24 22:58:20.000000000 +0900
+@@ -4,8 +4,8 @@
+   gzip -d news.2012.en.shuffled.gz -f
  fi
--time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
--time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+ sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
+-time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
+-time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
++time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
++time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
+ tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
+-time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
 -./distance vectors-phrase.bin
-\ No newline at end of file
-+if [ ! -e vectors-phrase.bin ]; then
-+  time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
-+  time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-+fi
++time @EXECDIR@/word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
 +@EXECDIR@/distance vectors-phrase.bin
---- demo-word-accuracy.sh.orig	2014-02-22 20:36:32.000000000 +0900
-+++ demo-word-accuracy.sh	2014-02-22 20:31:16.000000000 +0900
-@@ -1,8 +1,10 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
--  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
--  gzip -d text8.gz -f
-+  curl -O http://mattmahoney.net/dc/text8.zip
-+  unzip text8.zip
+--- demo-word-accuracy.sh.orig	2014-09-07 01:54:27.000000000 +0900
++++ demo-word-accuracy.sh	2014-12-24 22:58:49.000000000 +0900
+@@ -3,6 +3,6 @@
+   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+   gzip -d text8.gz -f
  fi
--time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
 -./compute-accuracy vectors.bin 30000 < questions-words.txt
 -# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
-+if [ ! -e vectors.bin ]; then
-+  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-+fi
-+@EXECDIR@/compute-accuracy vectors.bin 30000 < @EXDIR@/questions-words.txt
-+# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < @EXDIR@/questions-words.txt
---- demo-word.sh.orig	2014-02-22 20:36:47.000000000 +0900
-+++ demo-word.sh	2014-02-22 20:31:57.000000000 +0900
-@@ -1,7 +1,9 @@
--make
-+#!/bin/sh
- if [ ! -e text8 ]; then
--  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
--  gzip -d text8.gz -f
-+  curl -O http://mattmahoney.net/dc/text8.zip
-+  unzip text8.zip
++time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
++@EXECDIR@/compute-accuracy vectors.bin 30000 < questions-words.txt
++# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < questions-words.txt
+--- demo-word.sh.orig	2014-09-07 01:54:27.000000000 +0900
++++ demo-word.sh	2014-12-24 22:59:00.000000000 +0900
+@@ -3,5 +3,5 @@
+   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+   gzip -d text8.gz -f
  fi
--time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
 -./distance vectors.bin
-\ No newline at end of file
-+if [ ! -e vectors.bin ]; then
-+  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-+fi
++time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
 +@EXECDIR@/distance vectors.bin
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.macosforge.org/pipermail/macports-changes/attachments/20141225/fd35cf21/attachment.html>