zoom1 zoom2 zoom3

philippewang.info

unique

(written in Objective Caml)

This program outputs to stdout what is read from stdin, removing duplicated lines.

It is meant to be fast, at the cost of memory: RAM usage is linear in the number of distinct lines. More precisely, it builds a balanced tree containing every distinct line read so far, and uses it to decide whether a line has already been seen.

If you want the lines sorted, or if you do not care about their order, use sort -u instead.

If you want to use it without compiling it to a binary file, I suggest you add this line at the very beginning of the file :
#!/usr/bin/env ocaml
(but you won't be able to compile it with ocamlc or ocamlopt with this line)

version 2

This version does not buffer its output: everything is flushed quickly.
(************************************************************************)
(* (c) 2006 Philippe Wang  ( mail@philippewang.info )                   *)
(*                                                                      *)
(* This program outputs to stdout what is read from stdin,              *)
(* removing duplicated lines                                            *)
(*                                                                      *)
(*  GPL >= 2                                                            *)
(*                                                                      *)
(************************************************************************)

(* $Id: script.unique.whp,v 1.2 2007/08/19 09:40:48 philippeb Exp $ *)

(* Balanced-tree set of strings, used to remember the lines already seen. *)
module M = Set.Make (String)

(* Alias retained from the original prelude; nothing below refers to it. *)
module Q = Queue

(* Brings [empty], [mem] and [add] into scope unqualified. *)
open M

(* Every distinct line read so far. *)
let tree = ref empty

(* [usage ()] writes the help text to stderr and terminates the program with
   exit status 1.  The text goes to stderr rather than stdout because stdout
   is this filter's data stream: help text must never reach whatever is
   consuming the deduplicated lines. *)
let usage () =
  let prog = Sys.argv.(0) in
  prerr_string prog;
  prerr_string
    ("\nusage: " ^ prog ^ "\n\n"
     ^ "outputs to stdout what is read from stdin, removing duplicated lines\n\n"
     ^ "N.B. you may need a lot of RAM if you give a lot of different lines!\n");
  exit 1

(* Entry point: copies stdin to stdout line by line, printing each line only
   the first time it is encountered.  The program takes no arguments; any
   argument triggers [usage].  Exits with status 0 at end of input. *)
let () =
  if Array.length Sys.argv > 1 then usage ();
  try
    while true do
      (* [read_line] flushes stdout before it reads, so a line printed by the
         previous iteration is pushed out before we block on input again.
         Do not swap it for [input_line stdin], which does not flush. *)
      let l = read_line () in
      if not (mem l !tree) then begin
        tree := add l !tree;
        print_string l;
        print_char '\n'
      end
    done
  with End_of_file -> exit 0

(* end of unique.ml                                              *)

version 1

This version is buffered: it uses more RAM and produces no output until it has reached the end of the input.
(************************************************************************)
(* (c) 2006 Philippe Wang  ( mail@philippewang.info )                   *)
(*                                                                      *)
(* This program outputs to stdout what is read from stdin,              *)
(* removing duplicated lines                                            *)
(*                                                                      *)
(*  GPL >= 2                                                            *)
(*                                                                      *)
(************************************************************************)

(* $Id: script.unique.whp,v 1.2 2007/08/19 09:40:48 philippeb Exp $ *)

(* Balanced-tree set of strings, used to remember the lines already seen. *)
module M = Set.Make (String)

(* Short alias for the FIFO queue that holds the pending output. *)
module Q = Queue

(* Brings [empty], [mem] and [add] into scope unqualified. *)
open M

(* Every distinct line read so far. *)
let tree = ref empty

(* The distinct lines, in order of first appearance, waiting to be printed
   once the whole input has been read. *)
let lines : string Q.t = Q.create ()

(* [usage ()] writes the help text to stderr and terminates the program with
   exit status 1.  The text goes to stderr rather than stdout because stdout
   is this filter's data stream: help text must never reach whatever is
   consuming the deduplicated lines. *)
let usage () =
  let prog = Sys.argv.(0) in
  prerr_string prog;
  prerr_string
    ("\nusage: " ^ prog ^ "\n\n"
     ^ "outputs to stdout what is read from stdin, removing duplicated lines\n\n"
     ^ "N.B. you may need a lot of RAM if you give a lot of different lines!\n");
  exit 1

(* Entry point: reads all of stdin, queueing each line the first time it is
   encountered, then prints the queued lines in order once end of input is
   reached.  The program takes no arguments; any argument triggers [usage]. *)
let () =
  if Array.length Sys.argv > 1 then usage ();
  try
    while true do
      let l = read_line () in
      if not (mem l !tree) then begin
        tree := add l !tree;
        Q.add l lines
      end
    done
  with End_of_file ->
    (* Input exhausted: emit everything that was held back. *)
    Q.iter (fun s -> print_string s; print_char '\n') lines

(* end of unique.ml                                              *)

:: philippewang.info ::

:: design & photos by Philippe Wang :: XHTML 1.1 :: CSS 2 :: RSS 2 :: stats :: contact ::
:: Best viewed with Safari or Opera or Firefox or Links :: No SPAM Please ::
 
This page was generated on Sun Nov 18 16:58:32 CET 2007 by BashGXD