Revision: 117348 https://trac.macports.org/changeset/117348 Author: hum@macports.org Date: 2014-02-23 06:36:18 -0800 (Sun, 23 Feb 2014) Log Message: ----------- New port: word2vec @20131218 - Tool for computing continuous distributed representations of words. Added Paths: ----------- trunk/dports/textproc/word2vec/ trunk/dports/textproc/word2vec/Portfile trunk/dports/textproc/word2vec/files/ trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff trunk/dports/textproc/word2vec/files/patch-demo.diff trunk/dports/textproc/word2vec/files/patch-malloc.diff Added: trunk/dports/textproc/word2vec/Portfile =================================================================== --- trunk/dports/textproc/word2vec/Portfile (rev 0) +++ trunk/dports/textproc/word2vec/Portfile 2014-02-23 14:36:18 UTC (rev 117348) @@ -0,0 +1,65 @@ +# -*- coding: utf-8; mode: tcl; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- vim:fenc=utf-8:ft=tcl:et:sw=4:ts=4:sts=4 +# $Id$ + +PortSystem 1.0 + +name word2vec +version 20131218 +categories textproc +maintainers hum openmaintainer + +description Tool for computing continuous distributed representations of words + +long_description This tool provides an efficient implementation of the \ + continuous bag-of-words and skip-gram architectures for \ + computing vector representations of words. These \ + representations can be subsequently used in many natural \ + language processing applications and for further research. 
+ +homepage https://code.google.com/p/word2vec/ +platforms darwin +license Apache-2 + +fetch.type svn +svn.url http://word2vec.googlecode.com/svn/trunk +svn.revision 37 +worksrcdir trunk + +patchfiles patch-malloc.diff \ + patch-compute-accuracy.c.diff \ + patch-demo.diff + +use_configure no +variant universal {} + +configure.optflags -O2 + +build.args CC="${configure.cc}" \ + CFLAGS="${configure.cflags} [get_canonical_archflags] -lm -pthread -Wall -funroll-loops -Wunused-result" + +destroot { + set execdir ${prefix}/libexec/${name} + xinstall -d ${destroot}${execdir} + xinstall -m 755 -W ${worksrcpath} \ + word2vec word2phrase distance word-analogy compute-accuracy \ + demo-analogy.sh demo-classes.sh demo-phrase-accuracy.sh \ + demo-phrases.sh demo-word-accuracy.sh demo-word.sh \ + ${destroot}${execdir} + set exdir ${prefix}/share/examples/${name} + xinstall -d ${destroot}${exdir} + xinstall -m 644 -W ${worksrcpath} \ + questions-phrases.txt questions-words.txt \ + ${destroot}${exdir} + # fix demo scripts. 
+ foreach f [glob ${destroot}${execdir}/demo-*.sh] { + reinplace "s|@EXECDIR@|${execdir}|g" ${f} + reinplace "s|@EXDIR@|${exdir}|g" ${f} + } + set docdir ${prefix}/share/doc/${name} + xinstall -d ${destroot}${docdir} + xinstall -m 644 -W ${worksrcpath} \ + LICENSE README.txt \ + ${destroot}${docdir} +} + +livecheck.url none Property changes on: trunk/dports/textproc/word2vec/Portfile ___________________________________________________________________ Added: svn:keywords + Id Added: svn:eol-style + native Added: trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff =================================================================== --- trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff (rev 0) +++ trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff 2014-02-23 14:36:18 UTC (rev 117348) @@ -0,0 +1,12 @@ +--- compute-accuracy.c.orig 2014-02-23 20:29:23.000000000 +0900 ++++ compute-accuracy.c 2014-02-23 20:30:44.000000000 +0900 +@@ -28,7 +28,8 @@ + FILE *f; + char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; + float dist, len, bestd[N], vec[max_size]; +- long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; ++ long long words, size, b, c, d, b1, b2, b3, threshold = 0; ++ volatile long long a = 0; + float *M; + char *vocab; + int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; Added: trunk/dports/textproc/word2vec/files/patch-demo.diff =================================================================== --- trunk/dports/textproc/word2vec/files/patch-demo.diff (rev 0) +++ trunk/dports/textproc/word2vec/files/patch-demo.diff 2014-02-23 14:36:18 UTC (rev 117348) @@ -0,0 +1,116 @@ +--- demo-analogy.sh.orig 2014-02-22 20:36:04.000000000 +0900 ++++ demo-analogy.sh 2014-02-22 20:27:27.000000000 +0900 +@@ -1,11 +1,13 @@ +-make ++#!/bin/sh + if [ ! 
-e text8 ]; then +- wget http://mattmahoney.net/dc/text8.zip -O text8.gz +- gzip -d text8.gz -f ++ curl -O http://mattmahoney.net/dc/text8.zip ++ unzip text8.zip + fi + echo ----------------------------------------------------------------------------------------------------- + echo Note that for the word analogy to perform well, the models should be trained on much larger data sets + echo Example input: paris france berlin + echo ----------------------------------------------------------------------------------------------------- +-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 +-./word-analogy vectors.bin ++if [ ! -e vectors.bin ]; then ++ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 ++fi ++@EXECDIR@/word-analogy vectors.bin +--- demo-classes.sh.orig 2014-02-22 20:36:09.000000000 +0900 ++++ demo-classes.sh 2014-02-22 20:22:53.000000000 +0900 +@@ -1,8 +1,8 @@ +-make ++#!/bin/sh + if [ ! -e text8 ]; then +- wget http://mattmahoney.net/dc/text8.zip -O text8.gz +- gzip -d text8.gz -f ++ curl -O http://mattmahoney.net/dc/text8.zip ++ unzip text8.zip + fi +-time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500 ++time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500 + sort classes.txt -k 2 -n > classes.sorted.txt + echo The word classes were saved to file classes.sorted.txt +--- demo-phrase-accuracy.sh.orig 2014-02-22 20:36:25.000000000 +0900 ++++ demo-phrase-accuracy.sh 2014-02-22 20:29:40.000000000 +0900 +@@ -1,12 +1,14 @@ +-make ++#!/bin/sh + if [ ! 
-e text8 ]; then +- wget http://mattmahoney.net/dc/text8.zip -O text8.gz +- gzip -d text8.gz -f ++ curl -O http://mattmahoney.net/dc/text8.zip ++ unzip text8.zip + fi + echo ---------------------------------------------------------------------------------------------------------------- + echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus + echo To achieve better accuracy, larger training set is needed + echo ---------------------------------------------------------------------------------------------------------------- +-time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3 +-time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3 +-./compute-accuracy vectors-phrase.bin <questions-phrases.txt ++if [ ! -e vectors-phrase.bin ]; then ++ time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3 ++ time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3 ++fi ++@EXECDIR@/compute-accuracy vectors-phrase.bin < @EXDIR@/questions-phrases.txt +--- demo-phrases.sh.orig 2014-02-22 20:36:17.000000000 +0900 ++++ demo-phrases.sh 2014-02-22 20:30:19.000000000 +0900 +@@ -1,8 +1,10 @@ +-make ++#!/bin/sh + if [ ! -e text8 ]; then +- wget http://mattmahoney.net/dc/text8.zip -O text8.gz +- gzip -d text8.gz -f ++ curl -O http://mattmahoney.net/dc/text8.zip ++ unzip text8.zip + fi +-time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 +-time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 +-./distance vectors-phrase.bin +\ No newline at end of file ++if [ ! 
-e vectors-phrase.bin ]; then ++ time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 ++ time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 ++fi ++@EXECDIR@/distance vectors-phrase.bin +--- demo-word-accuracy.sh.orig 2014-02-22 20:36:32.000000000 +0900 ++++ demo-word-accuracy.sh 2014-02-22 20:31:16.000000000 +0900 +@@ -1,8 +1,10 @@ +-make ++#!/bin/sh + if [ ! -e text8 ]; then +- wget http://mattmahoney.net/dc/text8.zip -O text8.gz +- gzip -d text8.gz -f ++ curl -O http://mattmahoney.net/dc/text8.zip ++ unzip text8.zip + fi +-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 +-./compute-accuracy vectors.bin 30000 < questions-words.txt +-# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt ++if [ ! -e vectors.bin ]; then ++ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 ++fi ++@EXECDIR@/compute-accuracy vectors.bin 30000 < @EXDIR@/questions-words.txt ++# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < @EXDIR@/questions-words.txt +--- demo-word.sh.orig 2014-02-22 20:36:47.000000000 +0900 ++++ demo-word.sh 2014-02-22 20:31:57.000000000 +0900 +@@ -1,7 +1,9 @@ +-make ++#!/bin/sh + if [ ! -e text8 ]; then +- wget http://mattmahoney.net/dc/text8.zip -O text8.gz +- gzip -d text8.gz -f ++ curl -O http://mattmahoney.net/dc/text8.zip ++ unzip text8.zip + fi +-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 +-./distance vectors.bin +\ No newline at end of file ++if [ ! 
-e vectors.bin ]; then ++ time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 ++fi ++@EXECDIR@/distance vectors.bin Added: trunk/dports/textproc/word2vec/files/patch-malloc.diff =================================================================== --- trunk/dports/textproc/word2vec/files/patch-malloc.diff (rev 0) +++ trunk/dports/textproc/word2vec/files/patch-malloc.diff 2014-02-23 14:36:18 UTC (rev 117348) @@ -0,0 +1,33 @@ +--- compute-accuracy.c.orig 2014-02-22 19:15:25.000000000 +0900 ++++ compute-accuracy.c 2014-02-22 19:17:40.000000000 +0900 +@@ -16,7 +16,7 @@ + #include <stdlib.h> + #include <string.h> + #include <math.h> +-#include <malloc.h> ++#include <stdlib.h> + #include <ctype.h> + + const long long max_size = 2000; // max length of strings +--- distance.c.orig 2014-02-22 19:15:32.000000000 +0900 ++++ distance.c 2014-02-22 19:16:29.000000000 +0900 +@@ -15,7 +15,7 @@ + #include <stdio.h> + #include <string.h> + #include <math.h> +-#include <malloc.h> ++#include <stdlib.h> + + const long long max_size = 2000; // max length of strings + const long long N = 40; // number of closest words that will be shown +--- word-analogy.c.orig 2014-02-22 19:15:49.000000000 +0900 ++++ word-analogy.c 2014-02-22 19:17:27.000000000 +0900 +@@ -15,7 +15,7 @@ + #include <stdio.h> + #include <string.h> + #include <math.h> +-#include <malloc.h> ++#include <stdlib.h> + + const long long max_size = 2000; // max length of strings + const long long N = 40; // number of closest words that will be shown
participants (1)
- hum@macports.org