#!/usr/bin/python
# create_characters.py
  2. from os import listdir
  3. from GrayscaleImage import GrayscaleImage
  4. from NormalizedCharacterImage import NormalizedCharacterImage
  5. from Character import Character
  6. from data import IMAGES_FOLDER, exists, fload, fdump
  7. NORMALIZED_HEIGHT = 42
  8. def load_characters(neighbours, blur_scale, verbose=0):
  9. chars_file = 'characters_%s_%s.dat' % (blur_scale, neighbours)
  10. if exists(chars_file):
  11. print 'Loading characters...'
  12. chars = fload(chars_file)
  13. else:
  14. print 'Going to generate character objects...'
  15. chars = []
  16. for char in sorted(listdir(IMAGES_FOLDER)):
  17. count = 0
  18. for image in sorted(listdir(IMAGES_FOLDER + char)):
  19. image = GrayscaleImage(IMAGES_FOLDER + char + '/' + image)
  20. norm = NormalizedCharacterImage(image, blur=blur_scale, \
  21. height=NORMALIZED_HEIGHT)
  22. character = Character(char, [], norm)
  23. character.get_single_cell_feature_vector(neighbours)
  24. chars.append(character)
  25. count += 1
  26. if verbose:
  27. print 'Loaded character %s %d times' % (char, count)
  28. if verbose:
  29. print 'Saving characters...'
  30. fdump(chars, chars_file)
  31. return chars
  32. def load_learning_set(neighbours, blur_scale, verbose=0):
  33. learning_set_file = 'learning_set_%s_%s.dat' % (blur_scale, neighbours)
  34. if exists(learning_set_file):
  35. if verbose:
  36. print 'Loading learning set...'
  37. learning_set = fload(learning_set_file)
  38. if verbose:
  39. print 'Learning set:', [c.value for c in learning_set]
  40. else:
  41. learning_set = generate_sets(neighbours, blur_scale, \
  42. verbose=verbose)[0]
  43. return learning_set
  44. def load_test_set(neighbours, blur_scale, verbose=0):
  45. test_set_file = 'test_set_%s_%s.dat' % (blur_scale, neighbours)
  46. if exists(test_set_file):
  47. if verbose:
  48. print 'Loading test set...'
  49. test_set = fload(test_set_file)
  50. if verbose:
  51. print 'Test set:', [c.value for c in test_set]
  52. else:
  53. test_set = generate_sets(neighbours, blur_scale, verbose=verbose)[1]
  54. return test_set
  55. def generate_sets(neighbours, blur_scale, verbose=0):
  56. """Split the entire dataset into a trainingset and a testset."""
  57. suffix = '_%s_%s' % (blur_scale, neighbours)
  58. learning_set_file = 'learning_set%s.dat' % suffix
  59. test_set_file = 'test_set%s.dat' % suffix
  60. chars = load_characters(neighbours, blur_scale, verbose=verbose)
  61. if verbose:
  62. print 'Going to generate learning set and test set...'
  63. learning_set = []
  64. test_set = []
  65. learned = []
  66. for char in chars:
  67. if learned.count(char.value) == 70:
  68. test_set.append(char)
  69. else:
  70. learning_set.append(char)
  71. learned.append(char.value)
  72. if verbose:
  73. print 'Learning set:', [c.value for c in learning_set]
  74. print '\nTest set:', [c.value for c in test_set]
  75. print '\nSaving learning set...'
  76. fdump(learning_set, learning_set_file)
  77. if verbose:
  78. print 'Saving test set...'
  79. fdump(test_set, test_set_file)
  80. return learning_set, test_set
  81. if __name__ == '__main__':
  82. from sys import argv, exit
  83. if len(argv) < 3:
  84. print 'Usage: python %s NEIGHBOURS BLUR_SCALE' % argv[0]
  85. exit(1)
  86. neighbours = int(argv[1])
  87. blur_scale = float(argv[2])
  88. # Generate the character file and the learning set/test set files
  89. load_learning_set(neighbours, blur_scale, verbose=1)
  90. load_test_set(neighbours, blur_scale, verbose=1)