|
15 | 15 | */ |
16 | 16 | package dev.morling.onebrc; |
17 | 17 |
|
18 | | -import java.io.BufferedReader; |
19 | 18 | import java.io.BufferedWriter; |
20 | | -import java.io.FileReader; |
21 | 19 | import java.io.FileWriter; |
22 | | -import java.io.IOException; |
23 | | -import java.io.StringReader; |
24 | | -import java.nio.charset.StandardCharsets; |
25 | | -import java.util.ArrayList; |
26 | | -import java.util.HashSet; |
27 | | -import java.util.concurrent.ThreadLocalRandom; |
28 | 20 |
|
29 | | -public class CreateMeasurements3 { |
| 21 | +import org.rschwietzke.FastRandom; |
30 | 22 |
|
31 | | - public static final int MAX_NAME_LEN = 100; |
32 | | - public static final int KEYSET_SIZE = 10_000; |
| 23 | +public class CreateMeasurements3 { |
33 | 24 |
|
34 | 25 | public static void main(String[] args) throws Exception { |
35 | | - if (args.length != 1) { |
36 | | - System.out.println("Usage: create_measurements3.sh <number of records to create>"); |
| 26 | + if (args.length < 1) { |
| 27 | + System.out.println("Usage: create_measurements3.sh <number of records to create> [seed]"); |
37 | 28 | System.exit(1); |
38 | 29 | } |
| 30 | + |
39 | 31 | int size = 0; |
40 | 32 | try { |
41 | 33 | size = Integer.parseInt(args[0]); |
42 | 34 | } |
43 | 35 | catch (NumberFormatException e) { |
44 | 36 | System.out.println("Invalid value for <number of records to create>"); |
45 | | - System.out.println("Usage: create_measurements3.sh <number of records to create>"); |
| 37 | + System.out.println("Usage: create_measurements3.sh <number of records to create> [seed]"); |
46 | 38 | System.exit(1); |
47 | 39 | } |
48 | | - final var weatherStations = generateWeatherStations(); |
| 40 | + |
| 41 | + // Default seed is the ASCII bytes of the string "1brc1brc" packed into a 64-bit hexadecimal constant |
| 42 | + long seed = 0x3162726331627263L; |
| 43 | + if (args.length == 2) { |
| 44 | + try { |
| 45 | + seed = Long.parseLong(args[1]); |
| 46 | + } |
| 47 | + catch (NumberFormatException e) { |
| 48 | + System.out.println("Invalid value for [seed]"); |
| 49 | + System.out.println("Usage: CreateMeasurements2 <number of records to create> [seed]"); |
| 50 | + System.exit(1); |
| 51 | + } |
| 52 | + } |
| 53 | + |
| 54 | + final var weatherStations = WeatherStationFactory.getWeatherStationsList(seed); |
49 | 55 | final var start = System.currentTimeMillis(); |
50 | | - final var rnd = ThreadLocalRandom.current(); |
| 56 | + final var rnd = new FastRandom(seed); |
51 | 57 | try (var out = new BufferedWriter(new FileWriter("measurements.txt"))) { |
52 | 58 | for (int i = 1; i <= size; i++) { |
53 | | - var station = weatherStations.get(rnd.nextInt(KEYSET_SIZE)); |
54 | | - double temp = rnd.nextGaussian(station.avgTemp, 7.0); |
55 | | - out.write(station.name); |
| 59 | + var station = weatherStations.get(rnd.nextInt(weatherStations.size())); |
| 60 | + double temp = station.measurement(); |
| 61 | + out.write(station.id); |
56 | 62 | out.write(';'); |
57 | | - out.write(Double.toString(Math.round(temp * 10.0) / 10.0)); |
58 | | - out.newLine(); |
| 63 | + out.write(Double.toString(temp)); |
| 64 | + out.write('\n'); |
59 | 65 | if (i % 50_000_000 == 0) { |
60 | 66 | System.out.printf("Wrote %,d measurements in %,d ms%n", i, System.currentTimeMillis() - start); |
61 | 67 | } |
62 | 68 | } |
63 | 69 | } |
64 | 70 | } |
65 | | - |
66 | | - record WeatherStation(String name, float avgTemp) { |
67 | | - } |
68 | | - |
69 | | - private static ArrayList<WeatherStation> generateWeatherStations() throws Exception { |
70 | | - // Use a public list of city names and concatenate them all into a long string, |
71 | | - // which we'll use as a "source of city name randomness" |
72 | | - var bigName = new StringBuilder(1 << 20); |
73 | | - try (var rows = new BufferedReader(new FileReader("data/weather_stations.csv"));) { |
74 | | - skipComments(rows); |
75 | | - while (true) { |
76 | | - var row = rows.readLine(); |
77 | | - if (row == null) { |
78 | | - break; |
79 | | - } |
80 | | - bigName.append(row, 0, row.indexOf(';')); |
81 | | - } |
82 | | - } |
83 | | - final var weatherStations = new ArrayList<WeatherStation>(); |
84 | | - final var names = new HashSet<String>(); |
85 | | - var minLen = Integer.MAX_VALUE; |
86 | | - var maxLen = Integer.MIN_VALUE; |
87 | | - try (var rows = new BufferedReader(new FileReader("data/weather_stations.csv"))) { |
88 | | - skipComments(rows); |
89 | | - final var nameSource = new StringReader(bigName.toString()); |
90 | | - final var buf = new char[MAX_NAME_LEN]; |
91 | | - final var rnd = ThreadLocalRandom.current(); |
92 | | - final double yOffset = 4; |
93 | | - final double factor = 2500; |
94 | | - final double xOffset = 0.372; |
95 | | - final double power = 7; |
96 | | - for (int i = 0; i < KEYSET_SIZE; i++) { |
97 | | - var row = rows.readLine(); |
98 | | - if (row == null) { |
99 | | - break; |
100 | | - } |
101 | | - // Use a 7th-order curve to simulate the name length distribution. |
102 | | - // It gives us mostly short names, but with large outliers. |
103 | | - var nameLen = (int) (yOffset + factor * Math.pow(rnd.nextDouble() - xOffset, power)); |
104 | | - var count = nameSource.read(buf, 0, nameLen); |
105 | | - if (count == -1) { |
106 | | - throw new Exception("Name source exhausted"); |
107 | | - } |
108 | | - var nameBuf = new StringBuilder(nameLen); |
109 | | - nameBuf.append(buf, 0, nameLen); |
110 | | - if (Character.isWhitespace(nameBuf.charAt(0))) { |
111 | | - nameBuf.setCharAt(0, readNonSpace(nameSource)); |
112 | | - } |
113 | | - if (Character.isWhitespace(nameBuf.charAt(nameBuf.length() - 1))) { |
114 | | - nameBuf.setCharAt(nameBuf.length() - 1, readNonSpace(nameSource)); |
115 | | - } |
116 | | - var name = nameBuf.toString(); |
117 | | - while (names.contains(name)) { |
118 | | - nameBuf.setCharAt(rnd.nextInt(nameBuf.length()), readNonSpace(nameSource)); |
119 | | - name = nameBuf.toString(); |
120 | | - } |
121 | | - int actualLen; |
122 | | - while (true) { |
123 | | - actualLen = name.getBytes(StandardCharsets.UTF_8).length; |
124 | | - if (actualLen <= 100) { |
125 | | - break; |
126 | | - } |
127 | | - nameBuf.deleteCharAt(nameBuf.length() - 1); |
128 | | - if (Character.isWhitespace(nameBuf.charAt(nameBuf.length() - 1))) { |
129 | | - nameBuf.setCharAt(nameBuf.length() - 1, readNonSpace(nameSource)); |
130 | | - } |
131 | | - name = nameBuf.toString(); |
132 | | - } |
133 | | - if (name.indexOf(';') != -1) { |
134 | | - throw new Exception("Station name contains a semicolon!"); |
135 | | - } |
136 | | - names.add(name); |
137 | | - minLen = Integer.min(minLen, actualLen); |
138 | | - maxLen = Integer.max(maxLen, actualLen); |
139 | | - var lat = Float.parseFloat(row.substring(row.indexOf(';') + 1)); |
140 | | - // Guesstimate mean temperature using cosine of latitude |
141 | | - var avgTemp = (float) (30 * Math.cos(Math.toRadians(lat))) - 10; |
142 | | - weatherStations.add(new WeatherStation(name, avgTemp)); |
143 | | - } |
144 | | - } |
145 | | - System.out.format("Generated %,d station names with length from %,d to %,d%n", KEYSET_SIZE, minLen, maxLen); |
146 | | - return weatherStations; |
147 | | - } |
148 | | - |
149 | | - private static void skipComments(BufferedReader rows) throws IOException { |
150 | | - while (rows.readLine().startsWith("#")) { |
151 | | - } |
152 | | - } |
153 | | - |
154 | | - private static char readNonSpace(StringReader nameSource) throws IOException { |
155 | | - while (true) { |
156 | | - var n = nameSource.read(); |
157 | | - if (n == -1) { |
158 | | - throw new IOException("Name source exhausted"); |
159 | | - } |
160 | | - var ch = (char) n; |
161 | | - if (ch != ' ') { |
162 | | - return ch; |
163 | | - } |
164 | | - } |
165 | | - } |
166 | 71 | } |
0 commit comments