[117348] trunk/dports/textproc

hum at macports.org hum at macports.org
Sun Feb 23 06:36:18 PST 2014


Revision: 117348
          https://trac.macports.org/changeset/117348
Author:   hum at macports.org
Date:     2014-02-23 06:36:18 -0800 (Sun, 23 Feb 2014)
Log Message:
-----------
New port: word2vec @20131218 - Tool for computing continuous distributed representations of words.

Added Paths:
-----------
    trunk/dports/textproc/word2vec/
    trunk/dports/textproc/word2vec/Portfile
    trunk/dports/textproc/word2vec/files/
    trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff
    trunk/dports/textproc/word2vec/files/patch-demo.diff
    trunk/dports/textproc/word2vec/files/patch-malloc.diff

Added: trunk/dports/textproc/word2vec/Portfile
===================================================================
--- trunk/dports/textproc/word2vec/Portfile	                        (rev 0)
+++ trunk/dports/textproc/word2vec/Portfile	2014-02-23 14:36:18 UTC (rev 117348)
@@ -0,0 +1,65 @@
+# -*- coding: utf-8; mode: tcl; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- vim:fenc=utf-8:ft=tcl:et:sw=4:ts=4:sts=4
+# $Id$
+
+PortSystem          1.0
+
+name                word2vec
+version             20131218
+categories          textproc
+maintainers         hum openmaintainer
+
+description         Tool for computing continuous distributed representations of words
+
+long_description    This tool provides an efficient implementation of the \
+                    continuous bag-of-words and skip-gram architectures for \
+                    computing vector representations of words. These \
+                    representations can be subsequently used in many natural \
+                    language processing applications and for further research.
+
+homepage            https://code.google.com/p/word2vec/
+platforms           darwin
+license             Apache-2
+
+fetch.type          svn
+svn.url             http://word2vec.googlecode.com/svn/trunk
+svn.revision        37
+worksrcdir          trunk
+
+patchfiles          patch-malloc.diff \
+                    patch-compute-accuracy.c.diff \
+                    patch-demo.diff
+
+use_configure       no
+variant universal   {}
+
+configure.optflags  -O2
+
+build.args          CC="${configure.cc}" \
+                    CFLAGS="${configure.cflags} [get_canonical_archflags] -lm -pthread -Wall -funroll-loops -Wunused-result"
+
+destroot {
+    set execdir ${prefix}/libexec/${name}
+    xinstall -d ${destroot}${execdir}
+    xinstall -m 755 -W ${worksrcpath} \
+        word2vec word2phrase distance word-analogy compute-accuracy \
+        demo-analogy.sh demo-classes.sh demo-phrase-accuracy.sh \
+        demo-phrases.sh demo-word-accuracy.sh demo-word.sh \
+        ${destroot}${execdir}
+    set exdir ${prefix}/share/examples/${name}
+    xinstall -d ${destroot}${exdir}
+    xinstall -m 644 -W ${worksrcpath} \
+        questions-phrases.txt questions-words.txt \
+        ${destroot}${exdir}
+    # fix demo scripts.
+    foreach f [glob ${destroot}${execdir}/demo-*.sh] {
+        reinplace "s|@EXECDIR@|${execdir}|g" ${f}
+        reinplace "s|@EXDIR@|${exdir}|g" ${f}
+    }
+    set docdir ${prefix}/share/doc/${name}
+    xinstall -d ${destroot}${docdir}
+    xinstall -m 644 -W ${worksrcpath} \
+        LICENSE README.txt \
+        ${destroot}${docdir}
+}
+
+livecheck.url       none


Property changes on: trunk/dports/textproc/word2vec/Portfile
___________________________________________________________________
Added: svn:keywords
   + Id
Added: svn:eol-style
   + native

Added: trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff
===================================================================
--- trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff	                        (rev 0)
+++ trunk/dports/textproc/word2vec/files/patch-compute-accuracy.c.diff	2014-02-23 14:36:18 UTC (rev 117348)
@@ -0,0 +1,12 @@
+--- compute-accuracy.c.orig	2014-02-23 20:29:23.000000000 +0900
++++ compute-accuracy.c	2014-02-23 20:30:44.000000000 +0900
+@@ -28,7 +28,8 @@
+   FILE *f;
+   char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
+   float dist, len, bestd[N], vec[max_size];
+-  long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
++  long long words, size, b, c, d, b1, b2, b3, threshold = 0;
++  volatile long long a = 0;
+   float *M;
+   char *vocab;
+   int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;

Added: trunk/dports/textproc/word2vec/files/patch-demo.diff
===================================================================
--- trunk/dports/textproc/word2vec/files/patch-demo.diff	                        (rev 0)
+++ trunk/dports/textproc/word2vec/files/patch-demo.diff	2014-02-23 14:36:18 UTC (rev 117348)
@@ -0,0 +1,116 @@
+--- demo-analogy.sh.orig	2014-02-22 20:36:04.000000000 +0900
++++ demo-analogy.sh	2014-02-22 20:27:27.000000000 +0900
+@@ -1,11 +1,13 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+-  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+-  gzip -d text8.gz -f
++  curl -O http://mattmahoney.net/dc/text8.zip
++  unzip text8.zip
+ fi
+ echo -----------------------------------------------------------------------------------------------------
+ echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
+ echo Example input: paris france berlin
+ echo -----------------------------------------------------------------------------------------------------
+-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-./word-analogy vectors.bin
++if [ ! -e vectors.bin ]; then
++  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
++fi
++@EXECDIR@/word-analogy vectors.bin
+--- demo-classes.sh.orig	2014-02-22 20:36:09.000000000 +0900
++++ demo-classes.sh	2014-02-22 20:22:53.000000000 +0900
+@@ -1,8 +1,8 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+-  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+-  gzip -d text8.gz -f
++  curl -O http://mattmahoney.net/dc/text8.zip
++  unzip text8.zip
+ fi
+-time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
++time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
+ sort classes.txt -k 2 -n > classes.sorted.txt
+ echo The word classes were saved to file classes.sorted.txt
+--- demo-phrase-accuracy.sh.orig	2014-02-22 20:36:25.000000000 +0900
++++ demo-phrase-accuracy.sh	2014-02-22 20:29:40.000000000 +0900
+@@ -1,12 +1,14 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+-  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+-  gzip -d text8.gz -f
++  curl -O http://mattmahoney.net/dc/text8.zip
++  unzip text8.zip
+ fi
+ echo ----------------------------------------------------------------------------------------------------------------
+ echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
+ echo To achieve better accuracy, larger training set is needed
+ echo ----------------------------------------------------------------------------------------------------------------
+-time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
+-time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
+-./compute-accuracy vectors-phrase.bin <questions-phrases.txt
++if [ ! -e vectors-phrase.bin ]; then
++  time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
++  time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
++fi
++@EXECDIR@/compute-accuracy vectors-phrase.bin < @EXDIR@/questions-phrases.txt
+--- demo-phrases.sh.orig	2014-02-22 20:36:17.000000000 +0900
++++ demo-phrases.sh	2014-02-22 20:30:19.000000000 +0900
+@@ -1,8 +1,10 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+-  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+-  gzip -d text8.gz -f
++  curl -O http://mattmahoney.net/dc/text8.zip
++  unzip text8.zip
+ fi
+-time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
+-time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-./distance vectors-phrase.bin
+\ No newline at end of file
++if [ ! -e vectors-phrase.bin ]; then
++  time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
++  time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
++fi
++@EXECDIR@/distance vectors-phrase.bin
+--- demo-word-accuracy.sh.orig	2014-02-22 20:36:32.000000000 +0900
++++ demo-word-accuracy.sh	2014-02-22 20:31:16.000000000 +0900
+@@ -1,8 +1,10 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+-  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+-  gzip -d text8.gz -f
++  curl -O http://mattmahoney.net/dc/text8.zip
++  unzip text8.zip
+ fi
+-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-./compute-accuracy vectors.bin 30000 < questions-words.txt
+-# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
++if [ ! -e vectors.bin ]; then
++  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
++fi
++@EXECDIR@/compute-accuracy vectors.bin 30000 < @EXDIR@/questions-words.txt
++# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < @EXDIR@/questions-words.txt
+--- demo-word.sh.orig	2014-02-22 20:36:47.000000000 +0900
++++ demo-word.sh	2014-02-22 20:31:57.000000000 +0900
+@@ -1,7 +1,9 @@
+-make
++#!/bin/sh
+ if [ ! -e text8 ]; then
+-  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
+-  gzip -d text8.gz -f
++  curl -O http://mattmahoney.net/dc/text8.zip
++  unzip text8.zip
+ fi
+-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+-./distance vectors.bin
+\ No newline at end of file
++if [ ! -e vectors.bin ]; then
++  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
++fi
++@EXECDIR@/distance vectors.bin

Added: trunk/dports/textproc/word2vec/files/patch-malloc.diff
===================================================================
--- trunk/dports/textproc/word2vec/files/patch-malloc.diff	                        (rev 0)
+++ trunk/dports/textproc/word2vec/files/patch-malloc.diff	2014-02-23 14:36:18 UTC (rev 117348)
@@ -0,0 +1,33 @@
+--- compute-accuracy.c.orig	2014-02-22 19:15:25.000000000 +0900
++++ compute-accuracy.c	2014-02-22 19:17:40.000000000 +0900
+@@ -16,7 +16,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <math.h>
+-#include <malloc.h>
++#include <stdlib.h>
+ #include <ctype.h>
+ 
+ const long long max_size = 2000;         // max length of strings
+--- distance.c.orig	2014-02-22 19:15:32.000000000 +0900
++++ distance.c	2014-02-22 19:16:29.000000000 +0900
+@@ -15,7 +15,7 @@
+ #include <stdio.h>
+ #include <string.h>
+ #include <math.h>
+-#include <malloc.h>
++#include <stdlib.h>
+ 
+ const long long max_size = 2000;         // max length of strings
+ const long long N = 40;                  // number of closest words that will be shown
+--- word-analogy.c.orig	2014-02-22 19:15:49.000000000 +0900
++++ word-analogy.c	2014-02-22 19:17:27.000000000 +0900
+@@ -15,7 +15,7 @@
+ #include <stdio.h>
+ #include <string.h>
+ #include <math.h>
+-#include <malloc.h>
++#include <stdlib.h>
+ 
+ const long long max_size = 2000;         // max length of strings
+ const long long N = 40;                  // number of closest words that will be shown
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.macosforge.org/pipermail/macports-changes/attachments/20140223/a0d52bdd/attachment-0001.html>


More information about the macports-changes mailing list